美文网首页生信学习
TCGA子集做一个生存分析

TCGA子集做一个生存分析

作者: Stone_Stan4d | 来源:发表于2018-07-24 22:53 被阅读414次

    想拿TCGA-HNSC的数据集做一个生存分析。TCGA-HNSC的肿瘤类型较多。

    options(stringsAsFactors = F)
    library(survival)
    project <- 'HNSC'
    remove(list = ls())
    
    #====================================读取和临床资料整理=======================
    #读入临床数据,将临床特征转置为列名
    clinical <- read.table('HNSC.merged_only_clinical_clin_format.txt',
                                         header=T, row.names=1, sep='\t', fill = T)
    clinical <- as.data.frame(t(clinical))
    
    #构造一个新函数,从总表中提取生存时间和事件以及病理分级等信息
    clinSub <- function(clin){
      clin$IDs <- toupper(clin$patient.bcr_patient_barcode)
      rownames(clin) <- clin$IDs
      
      #一下一段代码为了获取new_tumor相关时间
      #获得列, new tumor event相关
      ind_keep <- grep('days_to_new_tumor_event_after_initial_treatment',colnames(clin))
      #这个是为了获取最大的那个时间值,对应的是无病生存期,总生存时间等,有点无聊
      new_tumor = as.matrix(clinical[,ind_keep])
      #返回值为NA值,或者new_tumor行的最小值。
      min_days = function(min_day){
        minimal_day = ifelse(sum(is.na(min_day)) < length(min_day), 
                             min(min_day, na.rm = T), NA)
        return(minimal_day)
      }
      new_tumor_collapsed = apply(new_tumor, 1, min_days)
      
      #获取生存时间,操作同new_tumor
      ind_keep = grep('days_to_death',colnames(clinical))
      death = as.matrix(clinical[,ind_keep])
      #构造一个获取生存时间函数,不同于new tumor取最小值,这里取最大值
      max_days = function(max_day){
        maximal_day = ifelse(sum(is.na(max_day)) < length(max_day), 
                             max(max_day, na.rm = T), NA)
        return(maximal_day)
      }
      death_collapsed = apply(death, 1, max_days)
      
      #获取随访截止时间
      ind_keep = grep('days_to_last_followup',colnames(clinical))
      follow = as.matrix(clinical[,ind_keep])
      follow_collapsed = apply(follow, 1, max_days)
      
      #表格整合
      all_clin <- data.frame(new_tumor_collapsed,death_collapsed,follow_collapsed)
      colnames(all_clin) = c('new_tumor_days', 'death_days', 'follow_days')
      rownames(all_clin) <- clin$IDs
      #创建新的列,包含肿瘤事件的时间和结局,1表示发生,0表示截尾
      all_clin$new_tumor_time = ifelse (!is.na(all_clin$new_tumor_days),
                                        as.numeric(all_clin$new_tumor_days),
                                        ifelse(!is.na(all_clin$follow_days),
                                               as.numeric(all_clin$follow_days),
                                               as.numeric(all_clin$death_days)))
      all_clin$new_tumor_event = ifelse(is.na(all_clin$new_tumor_days), 0, 1)
      #同样创建新的列来储存生存事件的时间和结局,1表示死亡,0表示截尾
      all_clin$death_time = ifelse(is.na(all_clin$death_days),
                                   as.numeric(all_clin$follow_days),
                                   as.numeric(all_clin$death_days))
      all_clin$death_event = ifelse(clin$patient.vital_status == 'alive', 0,1)
      
      #把放化疗吸烟饮酒史数据去除,形成一个新的临床数据,
      #再提取年龄性别解剖位置和临床病理资料
      clin_sub = clin[, -grep('radiation|drug|smoke|alcohol', colnames(clin))]
      clin_sub = clin_sub[, grep('//.age|subdivision|gender|pathologic|clinical', 
                                  colnames(clin_sub))]
      
      #把一般临床资料和生存资料整合到一起
      all_clin <- cbind(all_clin, clin_sub)
      #输出结果
      write.csv(all_clin, 'clinical_data_with_tnm.csv', row.names = T)
      #设置函数返回值
      return(all_clin)
    }
    
    #用构造的函数对临床资料进行处理
    clin_fil <- clinSub(clinical)
    #从HNSC的样本中提取tongue cancer
    tongue_clin <- clin_fil[grepl('tongue', clin_fil$patient.anatomic_neoplasm_subdivision),
                           c(4:9, 11, 13:20)]
    
    #下面的代码是把长的列名截短,其中一个用到高级函数lapply
    colnames(tongue_clin)[5:7] <- c('age', 'subdivision', 'gender')
    colsub <- function(colName){
      colSub = unlist(strsplit(colName, split = '\\.'))
      colSub = colSub[length(colSub)]
      return(colSub)
    }
    colnames(tongue_clin)[8:15] <- lapply(colnames(tongue_clin[8:15]), colsub)
    
    remove(clinSub, colsub)
    
    #经过分析,有一例ID为TCGA-CQ-A4CA的病例生存时间和DFS时间均是NA,需要剔除,剩159例 
    tongue_clin <- tongue_clin[-is.na(tongue_clin$new_tumor_time),]
    #============================读取基因表达数据=================================
    RNAseq.Rsem <- paste(project,
                         paste('rnaseqv2','illuminahiseq_rnaseqv2','unc_edu','Level_3',
                               'RSEM_genes_normalized','data.data.txt', sep = '__'),
                         sep = '.')
    
    #读入RNASeq RSEM数据,数据来自FireHose官网
    rna <- read.table(RNAseq.Rsem,
                      header=T,row.names=1,sep='\t',
                      stringsAsFactors = F)
    remove(RNAseq.Rsem)
    #去除第一行
    rna1 <- rna[-1,]
    #构造一个能够把基因名提取出来,并剔除无基因名的函数,以形成新的列表
    geneSub <- function(geneRna){
      #构造一个把基因名和Entrez ID分开的函数,便于后面lapply使用。
      SymbExtract <- function(x){
        sym = strsplit(x, split = '\\|')
        sym = sym[[1]][1]
        return(sym)
      }
      rnaTarget = geneRna[unlist(lapply(rownames(geneRna), SymbExtract)) != '?', ]
      #这一行代码是为了找出rnaTarget的基因ID有重复的基因名,准备删除
      geneDel <- rownames(which(table(unlist(lapply(rownames(rnaTarget), SymbExtract))) > 1, 
                                arr.ind = T))
      rnaTarget <- rnaTarget[!(unlist(lapply(rownames(rnaTarget), SymbExtract)) %in% geneDel),]
      #把基因名提取出来,配合使用lapply和前面构造的一个小函数,向量化运算
      rownames(rnaTarget) <- unlist(lapply(rownames(rnaTarget), SymbExtract))
      #把样本名截短, 只要前15个字符
      colnames(rnaTarget) <- gsub('\\.', '-', substr(colnames(rnaTarget), 1, 15))
      return(rnaTarget)
      
    }
    #利用构造的函数对rna进行处理
    rna1 <- geneSub(rna1)
    #只保留第14到15位字符是01和11的样本,他们分别代表原发肿瘤样本和正常样本
    rna1 <- rna1[, substr(colnames(rna1), 14, 15) %in% c('01', '11')]
    
    tongue_rna <- rna1[, substr(colnames(rna1), 1, 12) %in% rownames(tongue_clin)]
    #获得肿瘤或者正常样本的索引
    n_index <- which(substr(colnames(tongue_rna),14,15) == '11')
    t_index <- which(substr(colnames(tongue_rna),14,15) == '01')
    
    # 先删除在50%以上的样本中表达值为0的基因
    rem <- function(x){
      x = as.matrix(x)
      x = t(apply(x,1,as.numeric))
      r = as.numeric(apply(x,1,function(i) sum(i == 0)))
      remove = which(r > dim(x)[2]*0.5)
      return(remove)
    }
    remove <- rem(tongue_rna)
    #对tongue cancer的表达矩阵进行基因筛除
    tongue_rna <- tongue_rna[-remove,]
    
    
    library(limma)
    # 用voom函数来标准化数据,以便用limma包进行分析
    vm <- function(x){
      #对样本进行标记,列名第14,15位为01的是肿瘤样本,标记为1
      cond = factor(ifelse(substr(colnames(x), 14, 15) == '01', 1, 0))
      d = model.matrix(~1+cond)
      x = t(apply(x,1,as.numeric))
      ex = voom(x,d,plot = F)
      return(ex$E)
    }
    
    tongue_rna_vm  <- vm(tongue_rna)
    #colnames(tongue_rna_vm) <- gsub('\\.','-',substr(colnames(tongue_rna),1,12))
    dim(tongue_rna_vm)
    
    # 查看数据如何分布,应该呈正态
    hist(tongue_rna_vm)
    
    
    #z = (value gene X in tumor Y)-(mean gene X in normal)]/(standard deviation X in normal)
    
    
    # 计算z分数
    scal <- function(x,y){
      mean_n = rowMeans(y)  # mean of normal
      sd_n = apply(y,1,sd)  # SD of normal
      # z score as (value - mean normal)/SD normal
      res = matrix(nrow=nrow(x), ncol=ncol(x))
      colnames(res) = colnames(x)
      rownames(res) = rownames(x)
      for(i in 1:dim(x)[1]){
        for(j in 1:dim(x)[2]){
          res[i,j] <- (x[i,j]-mean_n[i])/sd_n[i]
        }
      }
      return(res)
    }
    z_rna <- scal(tongue_rna_vm[,t_index],tongue_rna_vm[,n_index])
    
    rm(tongue_rna_vm) #不再需要
    

    未完待续

    相关文章

      网友评论

        本文标题:TCGA子集做一个生存分析

        本文链接:https://www.haomeiwen.com/subject/kvrgpftx.html