# TCGA database download (TCGA数据库下载)
library(TCGAbiolinks)
library(stringr)
# Download from the GDC Data Portal using TCGAbiolinks: build a query for the
# TCGA-LAML RNA-Seq gene expression quantification files (HTSeq counts).
# NOTE(review): newer TCGAbiolinks / GDC releases may no longer accept the
# `legacy` argument or offer the "HTSeq - Counts" workflow -- confirm against
# the installed version before running.
query <- GDCquery(
  project = "TCGA-LAML",
  legacy = FALSE,
  experimental.strategy = "RNA-Seq",
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification",
  workflow.type = "HTSeq - Counts"
)

# Fetch the files matched by the query.
GDCdownload(query)

# GDCprepare: reads the downloaded data and prepares it into an R object.
# The result is coerced to a plain data.frame so that the row-name handling
# below behaves the same whether a data.frame, tibble or data.table comes back.
dataAssy <- as.data.frame(GDCprepare(query, summarizedExperiment = FALSE))

# Promote the first column to row names, then drop it from the table.
# `[[1]]` always yields a vector; drop = FALSE keeps a data.frame even when
# only a single column remains.
rownames(dataAssy) <- dataAssy[[1]]
dataAssy <- dataAssy[, -1, drop = FALSE]

# Shorten each column name to its leading "TCGA-x-x-x" part. Names that do not
# contain such a part are kept unchanged instead of being turned into NA.
short_names <- str_match(colnames(dataAssy), "(TCGA-[^-]*-[^-]*-[^-]*)")[, 2]
colnames(dataAssy) <- ifelse(is.na(short_names), colnames(dataAssy), short_names)

# Put the row names into an explicit leading "Symbol" column (as character,
# never factor). check.names = FALSE keeps the column names exactly as set
# above.
dataAssyout <- data.frame(
  Symbol = rownames(dataAssy),
  dataAssy,
  check.names = FALSE,
  stringsAsFactors = FALSE
)
str(dataAssyout)

# Keep the last rows in a small separate table for inspection.
tt <- tail(dataAssyout)
tt$Symbol <- as.character(tt$Symbol)
#for(i in 1:nrow(dataAssyout)){
# dataAssyout$Symbol[i]=str_split(dataAssyout$Symbol[i],"\\.")[[1]][1]
#}
# Strip everything from the first "." onwards in each identifier
# (e.g. "ENSG00000000003.13" -> "ENSG00000000003").
#
# x: character vector of identifiers. The function is vectorized: every
#    element is processed, values without a "." are returned unchanged, and
#    NA stays NA.
# Returns a character vector of the same length as x.
my_function <- function(x) {
  sub("\\..*$", "", x)
}
# Run my_function on every identifier in the Symbol column, one element at a
# time. vapply() is used instead of apply() so the column is not round-tripped
# through a matrix, and the result is checked to be exactly one string per row.
tt$Symbol <- vapply(tt$Symbol, my_function, character(1), USE.NAMES = FALSE)
dataAssyout$Symbol <- vapply(
  dataAssyout$Symbol, my_function, character(1),
  USE.NAMES = FALSE
)
head(dataAssyout)

# Drop the first five rows.
# NOTE(review): presumably these are non-gene summary rows of the count
# table -- confirm with the head() output above before relying on it.
dataAssyout2 <- dataAssyout[-seq_len(5), ]
head(dataAssyout2)
# 网友评论 (reader comments -- web page residue, not part of the script)