UCSC Xena 浏览器 分类打包,直接下载
GDCRNATools 基于 gdc-client 下载并简化整理数据,用 R 语言完成
备份的 TCGA 数据来源于 UCSC Xena,都放在:https://share.weiyun.com/5zLnKmO
需求最大的是 TCGA 数据库的生存分析和表达量差异分析
看看这两个视频:
https://www.bilibili.com/video/av25643438?p=9
https://www.bilibili.com/video/av49363776?p=6
(from 曾老师)
1.xena
# Download the TCGA-CHOL count, phenotype and survival tables from the GDC
# Xena hub into the working directory. The `if (FALSE)` guard keeps the
# downloads from running when the script is sourced; run the calls by hand
# when the files are needed. `FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
if (FALSE) {
  download.file(
    url = "https://gdc.xenahubs.net/download/TCGA-CHOL.htseq_counts.tsv.gz",
    destfile = "counts.tsv.gz"
  )
  download.file(
    url = "https://gdc.xenahubs.net/download/TCGA-CHOL.GDC_phenotype.tsv.gz",
    destfile = "phenotype.tsv.gz"
  )
  download.file(
    url = "https://gdc.xenahubs.net/download/TCGA-CHOL.survival.tsv.gz",
    destfile = "survival.tsv.gz"
  )
}
# Read the downloaded count table. The first column supplies the row names,
# and check.names = FALSE keeps the column headers exactly as written in the
# file instead of converting them to syntactic R names.
dat <- read.table(
  "counts.tsv.gz",
  header = TRUE,
  row.names = 1,
  check.names = FALSE
)
逆转 log2 转换:用 2^x - 1 把数值还原为原始 count
# Undo the transform (2^x - 1) and keep the result as a numeric matrix.
dat <- as.matrix(2^dat - 1)
# Peek at the top-left corner of the matrix.
dat[seq_len(4), seq_len(4)]
# Printing the values as text shows that some are not whole numbers.
as.character(dat[seq_len(100), seq_len(10)])
# Convert the back-transformed counts to an integer matrix.
# The values are rounded first: as.integer() truncates toward zero, so a
# value such as 4.9999999 produced by 2^x - 1 would otherwise become 4
# instead of 5.
# `storage.mode<-` changes the type in place and keeps the dimnames, so the
# row names do not have to be restored afterwards (apply() dropped them).
exp <- round(dat)
storage.mode(exp) <- "integer"
exp[1:4, 1:4]
# Read the phenotype (clinical) table as tab-separated text.
# quote = "" and comment.char = "" make read.table treat apostrophes, double
# quotes and '#' inside fields as ordinary characters; with the defaults such
# characters can merge or truncate rows. This assumes the file does not use
# quoting to protect fields -- TODO confirm against the downloaded file.
# fill = TRUE pads rows that have fewer fields than the header.
clinical <- read.table(
  "phenotype.tsv.gz",
  header = TRUE,
  sep = "\t",
  fill = TRUE,
  quote = "",
  comment.char = ""
)
# Read the survival table (whitespace-separated, with a header row).
surv <- read.table("survival.tsv.gz", header = TRUE)
# Inspect the first rows and columns of both tables.
clinical[1:4, 1:4]
surv[1:4, 1:4]
2.GDCRNATools #自制教程,可自学
http://bioconductor.org/packages/devel/bioc/vignettes/GDCRNATools/inst/doc/GDCRNATools.html
其他来源的RNA-seq数据
GEO
library(GEOquery)
# Fetch the GEO series record without the platform (GPL) annotation.
# destdir must be a directory path; the previous value FALSE would be pasted
# into a path starting with "FALSE/". "." stores the downloaded file in the
# current working directory.
eSet <- getGEO("GSE162550", destdir = ".", getGPL = FALSE)
# NOTE: unlike the earlier microarray datasets, the data for this series
# cannot be loaded directly through the R package. Look at the supplementary
# files on the GEO website and check exactly what type of data they contain.
# Build an expression table indexed by gene symbol from the GEO supplementary
# count file (read as tab-separated text, despite the .xls extension).
# The previous `rm(list = ls())` was removed: wiping the whole workspace is
# an unexpected side effect, and the objects used below are all reassigned
# here anyway.
dat <- read.table(
  "GSE162550_gene_sample_count_with_symbol.xls",
  header = TRUE,
  sep = "\t",
  fill = TRUE
)

# Columns that are summed for ranking and extracted as expression values.
sample_cols <- 4:9

# Row names must be unique, so check whether Symbol contains duplicates.
table(!duplicated(dat$Symbol))

# Sort rows by their total across the sample columns, highest first, so the
# de-duplication below keeps the row with the largest total for each symbol.
# The ordering used to be computed but never applied.
o <- order(rowSums(dat[, sample_cols]), decreasing = TRUE)
head(o) # base R inspection; `view()` does not exist in base R
dat <- dat[o, ]

# Keep the first row for each symbol.
dat <- dat[!duplicated(dat$Symbol), ]
# Drop placeholder symbols ("---"). The is.na() guard prevents an NA symbol
# from producing an all-NA row during logical subsetting.
dat <- dat[!is.na(dat$Symbol) & dat$Symbol != "---", ]

# Extract the sample columns and label the rows with the gene symbols.
exp <- dat[, sample_cols]
rownames(exp) <- dat$Symbol
网友评论