美文网首页
将TCGA下载的数据合并成矩阵

将TCGA下载的数据合并成矩阵

作者: 一只烟酒僧 | 来源:发表于2022-01-19 14:35 被阅读0次

    TCGA.R

    /bin/Rscript TCGA.R --help
    
    Options:
            --my_dir=MY_DIR
                    设置文件根目录
    
            --mainfest=MAINFEST
                    MAINFEST.txt文件路径
    
            --clinical_file=CLINICAL_FILE
                    clinical文件路径
    
            --sample_sheet=SAMPLE_SHEET
                    samplesheet文件路径
    
            --exclude_file_pattern=EXCLUDE_FILE_PATTERN
                    排除的文件正则模式
    
            --output_dir=OUTPUT_DIR
                    设置输出文件路径
    
            -h, --help
                    Show this help message and exit
    
    
    #示例
    /bin/Rscript TCGA.R  --my_dir /home/whq/myproject/zjl20211220/code/analysis/otherfile/TCGAfile/ --mainfest ../TCGA_dowloadfile/gdc_manifest_20220117_113651.txt --clinical_file ../TCGA_dowloadfile/clinical.tsv --sample_sheet ../TCGA_dowloadfile/gdc_sample_sheet.2022-01-17.tsv --exclude_file_pattern star_gene_counts.tsv.gz --output_dir ../
    

    具体代码

    # Load dependencies.
    # plyr is attached BEFORE dplyr on purpose: both packages export verbs with
    # the same names (mutate, summarise, arrange, ...). Attaching plyr last would
    # mask the dplyr versions; with this order the dplyr verbs win, while the
    # plyr-only helper mlply() stays available.
    library(plyr, quietly = TRUE)
    library(dplyr, quietly = TRUE)
    library(stringr, quietly = TRUE)
    library(openxlsx, quietly = TRUE)
    library(optparse, quietly = TRUE)
    
    # Command-line interface.
    # All options are character strings. --mainfest, --clinical_file and
    # --sample_sheet declare no default value; the remaining options fall back
    # to the defaults given below.
    option_list <- list(
      make_option(
        "--my_dir",
        type = "character",
        default = "./",
        help = "设置文件根目录"
      ),
      make_option(
        "--mainfest",
        type = "character",
        help = "MAINFEST.txt文件路径"
      ),
      make_option(
        "--clinical_file",
        type = "character",
        help = "clinical文件路径"
      ),
      make_option(
        "--sample_sheet",
        type = "character",
        help = "samplesheet文件路径"
      ),
      make_option(
        "--exclude_file_pattern",
        type = "character",
        default = "xxxxxxxxxxxxx",
        help = "排除的文件正则模式"
      ),
      make_option(
        "--output_dir",
        type = "character",
        default = "./",
        help = "设置输出文件路径"
      )
    )

    # Parse the options supplied on the command line into a named list.
    args <- parse_args(object = OptionParser(option_list = option_list))
    
    # Example values for interactive debugging; this branch never runs.
    if (FALSE) {
      my_dir <- "/home/whq/myproject/zjl20211220/code/analysis/otherfile/TCGAfile/"
      mainfest <- "../TCGA_dowloadfile/gdc_manifest_20220117_113651.txt"
      clinical_file <- "../TCGA_dowloadfile/clinical.tsv"
      sample_sheet <- "../TCGA_dowloadfile/gdc_sample_sheet.2022-01-17.tsv"
      exclude_file_pattern <- "star_gene_counts.tsv.gz"
      output_dir <- "../"
    }

    # Copy the parsed options into the plain variables used by the rest of the
    # script. `[[` is used instead of `$` so that no partial name matching can
    # occur on the option list.
    my_dir <- args[["my_dir"]]
    mainfest <- args[["mainfest"]]
    clinical_file <- args[["clinical_file"]]
    sample_sheet <- args[["sample_sheet"]]
    exclude_file_pattern <- args[["exclude_file_pattern"]]
    output_dir <- args[["output_dir"]]

    # The three input-file options have no default, so they are NULL when the
    # caller omits them. Fail here with a clear message instead of letting a
    # later file read fail on a NULL path.
    missing_opts <- c("mainfest", "clinical_file", "sample_sheet")
    missing_opts <- missing_opts[vapply(
      list(mainfest, clinical_file, sample_sheet),
      is.null,
      logical(1)
    )]
    if (length(missing_opts) > 0) {
      stop(
        "Missing required option(s): ",
        paste0("--", missing_opts, collapse = ", "),
        call. = FALSE
      )
    }
    if (!dir.exists(my_dir)) {
      stop("--my_dir is not an existing directory: ", my_dir, call. = FALSE)
    }

    # Make my_dir the working directory: every relative path used after this
    # point (input files, per-file download folders, output directory) is
    # resolved against it.
    setwd(my_dir)
    
    # Load the three tab-separated input tables.
    # read.delim() is read.csv() with sep = "\t"; all other defaults are the same.
    # NOTE(review): tcga_mainfest is read but not referenced again in the visible
    # part of the script.
    tcga_mainfest <- read.delim(mainfest)
    tcga_clinical <- read.delim(clinical_file)
    tcga_sample_sheet <- read.delim(sample_sheet)
    
    
    # Build the expression matrix ----
    # Each sample-sheet row points at one file, "<File.ID>/<File.Name>", resolved
    # against the working directory. Rows whose path matches the regular
    # expression exclude_file_pattern are flagged in `exclud`.
    # dplyr::/plyr:: prefixes are explicit because both packages export `mutate`.
    tcga_sample_sheet_filter <- tcga_sample_sheet %>%
      dplyr::mutate(
        file_dir = paste(File.ID, File.Name, sep = "/"),
        exclud = str_detect(file_dir, exclude_file_pattern)
      )
    tcga_mat_name_filter <- tcga_sample_sheet_filter %>%
      dplyr::filter(!exclud)

    # Fail early: merging zero files would silently produce an empty output, and
    # reporting every missing file at once is more useful than failing on the
    # first unreadable one.
    if (nrow(tcga_mat_name_filter) == 0) {
      stop(
        "No files left to merge after applying exclude_file_pattern: ",
        exclude_file_pattern,
        call. = FALSE
      )
    }
    missing_files <- tcga_mat_name_filter$file_dir[
      !file.exists(tcga_mat_name_filter$file_dir)
    ]
    if (length(missing_files) > 0) {
      stop(
        "File(s) listed in the sample sheet were not found: ",
        paste(missing_files, collapse = ", "),
        call. = FALSE
      )
    }

    # Read every remaining file as a headerless table whose first column holds
    # the row names, and label its column with the Case.ID of that row.
    # NOTE(review): this assumes one value column per file, and Case.ID labels
    # repeat when a case contributes several files - confirm both are acceptable.
    tcga_mat <- tcga_mat_name_filter %>%
      dplyr::select(file_dir, Case.ID) %>%
      plyr::mlply(function(file_dir, Case.ID) {
        read.csv(file_dir, sep = "\t", row.names = 1, header = FALSE) %>%
          setNames(as.character(Case.ID))
      })

    # Keep only the row names present in every file so the tables align.
    tcga_mat_rownames <- Reduce(intersect, lapply(tcga_mat, rownames))

    # Subset each table to the shared rows (same order) and bind column-wise.
    tcga_mat <- tcga_mat %>%
      lapply(function(x) {
        x[tcga_mat_rownames, , drop = FALSE]
      }) %>%
      do.call(cbind, .)
    
    # Collect annotations ----
    # Search every File.ID directory from the sample sheet (including those whose
    # data file was excluded from the matrix) for files whose name matches
    # "annotation". Directories are visited once each, and unlist() flattens the
    # result so that directories with zero or several matches are handled too.
    tcga_anno_files <- tcga_sample_sheet_filter$File.ID %>%
      as.character() %>%
      unique() %>%
      lapply(list.files, pattern = "annotation", full.names = TRUE) %>%
      unlist()

    # Read each annotation file (tab-separated, with header) and stack the rows.
    # When no annotation file exists, fall back to an empty table that still has
    # the `entity_id` key column, so a later join on that key keeps working.
    tcga_anno <- if (length(tcga_anno_files) > 0) {
      tcga_anno_files %>%
        lapply(function(x) {
          read.csv(x, header = TRUE, sep = "\t")
        }) %>%
        do.call(rbind, .)
    } else {
      data.frame(entity_id = character(0), stringsAsFactors = FALSE)
    }
    
    # Merge annotation and clinical tables ----
    # Left join: every clinical row is kept; annotation columns are NA for cases
    # without a matching entity_id.
    tcga_intagrated_info <- merge(
      tcga_clinical,
      tcga_anno,
      by.x = "case_id",
      by.y = "entity_id",
      all.x = TRUE
    )

    # Write the results ----
    # file.path() inserts the separator itself, so output_dir works with or
    # without a trailing "/" (paste0() would glue "out" and "tcga_mat.csv" into
    # "outtcga_mat.csv"). The directory is created when it does not exist yet.
    if (!dir.exists(output_dir)) {
      dir.create(output_dir, recursive = TRUE)
    }
    write.csv(
      tcga_mat,
      file.path(output_dir, "tcga_mat.csv"),
      quote = FALSE,
      row.names = TRUE
    )
    write.xlsx(
      tcga_intagrated_info,
      file.path(output_dir, "tcga_intagrated_info.xlsx"),
      overwrite = TRUE
    )
    
    

    相关文章

      网友评论

          本文标题:将TCGA下载的数据合并成矩阵

          本文链接:https://www.haomeiwen.com/subject/ahprhrtx.html