美文网首页
TCGA常见数据库下载方式

TCGA常见数据库下载方式

作者: 医只蜗牛 | 来源:发表于2021-07-22 20:51 被阅读0次

    TCGA常见下载方式:

    【01】直接复制链接,在线下载解压

    来源:

    > getwd()
    [1] "D:/R_code/follow_practice/xuetu_GEO_follow/week_practise/01_follow_practise/01_TP53_BRCA"
    
    # Step1 download TCGA dataset ---------------------------------------------
    # Builds `raw_data` (one row per gene ID, one column per sample) from the
    # HTSeq count table and caches it as an .Rdata file, so that later runs only
    # load the cache. Start from a fresh R session rather than wiping the
    # workspace with rm(list = ls()).
    counts_rdata <- "./data/TCGA-BRCA.htseq_counts.Rdata"
    if (!file.exists(counts_rdata)) {
      # Both folders are written to below; create them when they are missing.
      dir.create("./raw_data", showWarnings = FALSE, recursive = TRUE)
      dir.create("./data", showWarnings = FALSE, recursive = TRUE)
      gzfile <- "./raw_data/TCGA-BRCA.htseq_counts.tsv.gz"
      # mode = "wb": the archive is binary and must not be text-translated.
      download.file(
        "https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-BRCA.htseq_counts.tsv.gz",
        destfile = gzfile, mode = "wb"
      )
      # Keep the .gz next to the extracted .tsv; replace a stale extraction.
      R.utils::gunzip(gzfile, remove = FALSE, overwrite = TRUE)
      # The table is tab-separated; fread() only accepts a one-character sep.
      raw_data <- data.table::fread("./raw_data/TCGA-BRCA.htseq_counts.tsv",
                                    sep = "\t", header = TRUE)
      raw_data <- as.data.frame(raw_data)
      # The first column holds the gene IDs: move it into the row names.
      rownames(raw_data) <- raw_data[, 1]
      raw_data <- raw_data[, -1]
      # Previews must be printed explicitly inside an if-block.
      print(raw_data[1:5, 1:6])
      # Invert a log2(count + 1) encoding and round up to whole counts.
      # NOTE(review): assumes the hub file stores log2(count + 1) - confirm.
      raw_data <- ceiling(2^raw_data - 1)
      print(raw_data[1:5, 1:6])
      # Keep genes that have a zero count in fewer than 10 samples.
      pick_row <- rowSums(raw_data == 0) < 10
      raw_data <- raw_data[pick_row, ]
      print(dim(raw_data))
      save(raw_data, file = counts_rdata)
    } else {
      load(counts_rdata)
    }
    
    
    
    # Step2 Grouping by special clinical information --------------------------
    # Builds `phenoData` (clinical/phenotype table) and caches it as an .Rdata
    # file. The cache test looks at the .Rdata file itself, because that is the
    # file the else-branch loads; an already downloaded .gz is reused.
    pheno_rdata <- "./data/TCGA-BRCA.GDC_phenotype.Rdata"
    if (!file.exists(pheno_rdata)) {
      dir.create("./raw_data", showWarnings = FALSE, recursive = TRUE)
      dir.create("./data", showWarnings = FALSE, recursive = TRUE)
      gzfile <- "./raw_data/TCGA-BRCA.GDC_phenotype.tsv.gz"
      if (!file.exists(gzfile)) {
        # mode = "wb": the archive is binary and must not be text-translated.
        download.file(
          "https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-BRCA.GDC_phenotype.tsv.gz",
          destfile = gzfile, mode = "wb"
        )
      }
      # read.table() reads the gzip file directly; the separator is one tab
      # (read.table() rejects multi-character separators).
      phenoData <- read.table(gzfile,
                              header = TRUE,
                              sep = "\t",
                              quote = "")
      save(phenoData, file = pheno_rdata)
    } else {
      load(pheno_rdata)
    }
    
    ## Candidate grouping columns: those with 2 to 4 distinct values.
    ## table() ignores NA, so missing values do not count as a category.
    n_levels <- vapply(
      seq_len(ncol(phenoData)),
      function(col_num) length(table(phenoData[, col_num])),
      integer(1)
    )
    # Column indices of `phenoData` whose number of categories is 2, 3 or 4.
    pheno_num <- which(n_levels > 1 & n_levels < 5)
    # drop = FALSE keeps a data.frame even when a single column qualifies.
    if (interactive()) {
      View(phenoData[, pheno_num, drop = FALSE])
    }
    names(phenoData)[pheno_num]
    
    ## Category 3: TP53
    # Builds `mutype_file` (somatic mutation table) and caches it as an .Rdata
    # file. The cache test looks at the .Rdata file itself, because that is the
    # file the else-branch loads; an already downloaded .gz is reused.
    snv_rdata <- "./data/TCGA-BRCA.mutect2_snv.Rdata"
    if (!file.exists(snv_rdata)) {
      dir.create("./raw_data", showWarnings = FALSE, recursive = TRUE)
      dir.create("./data", showWarnings = FALSE, recursive = TRUE)
      gzfile <- "./raw_data/TCGA-BRCA.mutect2_snv.tsv.gz"
      if (!file.exists(gzfile)) {
        # mode = "wb": the archive is binary and must not be text-translated.
        download.file(
          "https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-BRCA.mutect2_snv.tsv.gz",
          destfile = gzfile, mode = "wb"
        )
      }
      # read.table() reads the gzip file directly; the separator is one tab
      # (read.table() rejects multi-character separators).
      mutype_file <- read.table(gzfile,
                                header = TRUE,
                                sep = "\t",
                                quote = "")
      save(mutype_file, file = snv_rdata)
    } else {
      load(snv_rdata)
    }
    
    ## Pick the mutation records (rows) whose gene is TP53
    # %in% never yields NA, so records with a missing gene are simply dropped
    # instead of turning into all-NA rows.
    TP53 <- mutype_file[mutype_file$gene %in% c("tp53", "TP53"), ]
    TP53_sample <- unique(sort(TP53$Sample_ID))
    # Characters 14-15 of each count-table column name are read as a two-digit
    # code; codes "00"-"09" (i.e. below 10) are kept as tumor samples. Matching
    # the digits explicitly avoids comparing a string with a number.
    # NOTE(review): assumes the column names are TCGA barcodes - confirm.
    tumor_sample <- colnames(raw_data)[
      grepl("^0[0-9]$", substr(colnames(raw_data), 14, 15))
    ]
    TP53_sample <- intersect(tumor_sample, TP53_sample) # set intersection
    # Every remaining tumor sample goes here, including samples that do not
    # appear in the mutation table at all.
    noTP53_sample <- setdiff(tumor_sample, TP53_sample)
    save(TP53_sample, noTP53_sample, file = "./data/sample_by_TP53.Rdata")
    
    
    # Step3 Filter samples ---------------------------------------------
    # Reload the cached count table, keep only the tumor samples selected
    # above (TP53 group first, then the rest) and store the expression table
    # together with its matching group labels.
    load("./data/TCGA-BRCA.htseq_counts.Rdata")

    tp53_sample <- append(TP53_sample, noTP53_sample)
    AssayData <- raw_data[, tp53_sample]
    dim(AssayData)
    # One label per column of AssayData, in the same order as tp53_sample.
    group_list <- rep(
      c("TP53", "NO_TP53"),
      times = c(length(TP53_sample), length(noTP53_sample))
    )
    save(AssayData, group_list,
         file = "./data/tnbc_tumor_TP53_AssayData.Rdata")
    
    

    【02】UCSCXenaTools包下载

    来源:

    UCSCXenaTools包下载,下载好之后直接是可操作形式,省去fread()函数读取和处理。

     getwd()
    [1] "D:/R_code/follow_practice/xuetu_GEO_follow/week_practise/01_follow_practise/03_TCGA-BRCA"
    

    这里需要注意:UCSCXenaTools下载的 TCGA-BRCA.mutect2_snv.tsv 可以直接使用;TCGA-BRCA.htseq_counts.tsv 原以为不行(读出来的格式不一样)。【修正】其实也可以直接用,只需把它直接赋值给一个变量并转成 data.frame(见下方代码)。

    需要注意,临床信息可能不一样。临床信息中TCGAbiolinks包下载的更佳。

    # Copy the downloaded table into `a` as a plain data.frame;
    # from here on it is used exactly like the table read in method 01.
    a <- as.data.frame(raw_data)

    就是说,前面那种下载解压方法,后面可以直接保存成 .Rdata 文件;上面 fread() 读出来的效果和 .Rdata 是一样的。但后面需要进行的操作不能少。

    结合这个看,两者结合。后面有筛选的部分。

    ## Equivalent to downloading the file from the official site by hand.
    ## Run in a fresh R session instead of wiping the workspace.
    library("UCSCXenaTools")
    # Select the cohort, narrow it to the count dataset, then query,
    # download and load it into R as `raw_data`.
    raw_data <- XenaGenerate(subset = XenaCohorts == "GDC TCGA Breast Cancer (BRCA)") %>%
      XenaFilter(filterDatasets = "TCGA-BRCA.htseq_counts.tsv") %>%
      XenaQuery() %>%
      XenaDownload() %>%
      XenaPrepare()   # load the downloaded data

    head(raw_data)

    ## Searching for a specific dataset instead:
    # stad_set4 <- XenaScan(pattern = 'stomach cancer ')
    # stad_set5 <- stad_set4 %>%
    #   XenaGenerate()
    ## Back to the first-step workflow: keep a real tab-separated copy that
    ## fread()/read.table() can parse (save() would write R's binary format
    ## under the .tsv name), plus an .Rdata copy for load().
    write.table(raw_data, file = "TCGA-BRCA.htseq_counts.tsv",
                sep = "\t", quote = FALSE, row.names = FALSE)
    save(raw_data, file = "TCGA-BRCA.htseq_counts.Rdata")
    
    
    
    library("UCSCXenaTools")
    # Same pipeline as for the counts, narrowed to the phenotype dataset;
    # the result is loaded into R as `phenotype_file`.
    phenotype_file <- XenaGenerate(subset = XenaCohorts == "GDC TCGA Breast Cancer (BRCA)") %>%
      XenaFilter(filterDatasets = "TCGA-BRCA.GDC_phenotype.tsv") %>%
      XenaQuery() %>%
      XenaDownload() %>%
      XenaPrepare()   # load the downloaded data

    head(phenotype_file)
    # Keep a real tab-separated copy (save() would write R's binary format
    # under the .tsv name) plus an .Rdata copy for load().
    write.table(phenotype_file, file = "TCGA-BRCA.GDC_phenotype_file.tsv",
                sep = "\t", quote = FALSE, row.names = FALSE)
    save(phenotype_file, file = "TCGA-BRCA.GDC_phenotype_file.Rdata")
    

    【03】官网下载:

    https://xenabrowser.net/datapages/
    要啥有啥

    相关文章

      网友评论

          本文标题:TCGA常见数据库下载方式

          本文链接:https://www.haomeiwen.com/subject/jeuymltx.html