美文网首页
TCGA_批量基因_生存分析_流程搭建

TCGA_批量基因_生存分析_流程搭建

作者: 小贝学生信 | 来源:发表于2022-01-11 23:44 被阅读0次

    前期基础:
    [R]TCGAbiolinks包:数据准备--query、download、prepare - 简书 (jianshu.com)
    R—生存分析—Kaplan-Meier + Log-rank test + Cox回归 - 简书 (jianshu.com)

    step1:下载TCGA表达数据

    library(TCGAbiolinks)
    projects = TCGAbiolinks:::getGDCprojects()$project_id
    grep("TCGA", projects, value = T)
    TCGAbiolinks:::getProjectSummary("TCGA-LIHC")$data_categories
    query <- GDCquery(project = c("TCGA-LIHC"),
                      legacy = FALSE, #default(GDC harmonized database)
                      data.category = "Transcriptome Profiling",
                      data.type = "Gene Expression Quantification",
                      workflow.type = "HTSeq - FPKM")
    dim(getResults(query))
    # [1] 424  29
    
    TP = TCGAquery_SampleTypes(getResults(query)$sample.submitter_id,"TP")
    NT = TCGAquery_SampleTypes(getResults(query)$sample.submitter_id,"NT")
    query <- GDCquery(project = c("TCGA-LIHC"),
                      legacy = FALSE, #default(GDC harmonized database)
                      data.category = "Transcriptome Profiling",
                      data.type = "Gene Expression Quantification",
                      workflow.type = "HTSeq - FPKM",
                      barcode = c(TP, NT))
    dim(getResults(query))
    # [1] 421  29
    GDCdownload(query, files.per.chunk = 10)
    data <- GDCprepare(query, save = T, save.filename = "LIHC_RNAseq_FPKM.rda")
    
    • 进一步整理信息
    #样本(临床)信息
    colData(data) = colData(data)[,"ID",drop=F]
    # 筛选 01 TP primary tumor
    tp_index = which(stringr::str_detect(colData(data)$ID,
                                         "-01"))
    data_tp = data[,tp_index]
    assay(data_tp)[1:4,1:4]
    #                 TCGA-DD-AAVS-01A-11R-A41C-07 TCGA-DD-AAE3-01A-11R-A41C-07
    # ENSG00000000003                    32.681039                    39.248105
    # ENSG00000000005                     0.000000                     0.000000
    # ENSG00000000419                    21.935276                    33.647190
    # ENSG00000000457                     3.062907                     0.844674
    #                 TCGA-DD-A4NS-01A-11R-A311-07 TCGA-5R-AA1D-01A-11R-A38B-07
    # ENSG00000000003                  12.58508419                  17.62580911
    # ENSG00000000005                   0.01225347                   0.02573094
    # ENSG00000000419                  17.12927554                  17.10958452
    # ENSG00000000457                   2.68276766                   1.22480857
    

    step2:结合ucsc整理好的生存信息

    #可根据选择癌症类型,修改下面的链接地址
    surdat_url="https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/survival%2FLIHC_survival.txt"
    # ucsc_sur = data.table::fread(surdat_url)
    ucsc_sur = data.table::fread("ucsc/Curated_survival_data.txt",
                                 data.table = F)
    head(ucsc_sur)
    colData(data_tp) = dplyr::left_join(as.data.frame(colData(data_tp)), 
                     ucsc_sur, by=c("ID"="sample")) %>% 
      DataFrame(., row.names = colnames(data_tp))
    
    head(colData(data_tp))
    # DataFrame with 6 rows and 11 columns
    # ID    X_PATIENT        OS   OS.time       DSS  DSS.time
    # <character>  <character> <integer> <integer> <integer> <integer>
    # TCGA-DD-AAVS-01A-11R-A41C-07 TCGA-DD-AAVS-01 TCGA-DD-AAVS         0      1823         0      1823
    # TCGA-DD-AAE3-01A-11R-A41C-07 TCGA-DD-AAE3-01 TCGA-DD-AAE3         0       566         0       566
    # TCGA-DD-A4NS-01A-11R-A311-07 TCGA-DD-A4NS-01 TCGA-DD-A4NS         1      2456         1      2456
    # TCGA-5R-AA1D-01A-11R-A38B-07 TCGA-5R-AA1D-01 TCGA-5R-AA1D         0       449         0       449
    # TCGA-BW-A5NO-01A-11R-A27V-07 TCGA-BW-A5NO-01 TCGA-BW-A5NO         0        20         0        20
    # TCGA-DD-AADL-01A-11R-A41C-07 TCGA-DD-AADL-01 TCGA-DD-AADL         0       636         0       636
    # DFI  DFI.time       PFI  PFI.time   Redaction
    # <integer> <integer> <integer> <integer> <character>
    # TCGA-DD-AAVS-01A-11R-A41C-07         0      1823         0      1823            
    # TCGA-DD-AAE3-01A-11R-A41C-07         0       566         0       566            
    # TCGA-DD-A4NS-01A-11R-A311-07         1       893         1       893            
    # TCGA-5R-AA1D-01A-11R-A38B-07         0       449         0       449            
    # TCGA-BW-A5NO-01A-11R-A27V-07        NA        NA         0        20            
    # TCGA-DD-AADL-01A-11R-A41C-07         0       636         0       636
    
    sur_clinical = colData(data_tp) %>% as.data.frame() %>% 
      dplyr::select(OS, OS.time) %>% 
      dplyr::mutate(time = OS.time/365) %>% 
      dplyr::rename(status = OS) %>% 
      dplyr::select(status, time)
    head(sur_clinical)
    #                              status       time
    # TCGA-DD-AAVS-01A-11R-A41C-07      0 4.99452055
    # TCGA-DD-AAE3-01A-11R-A41C-07      0 1.55068493
    # TCGA-DD-A4NS-01A-11R-A311-07      1 6.72876712
    # TCGA-5R-AA1D-01A-11R-A38B-07      0 1.23013699
    # TCGA-BW-A5NO-01A-11R-A27V-07      0 0.05479452
    # TCGA-DD-AADL-01A-11R-A41C-07      0 1.74246575
    

    step3:批量生存分析

    exp_t = assay(data_tp) %>% t()
    identical(rownames(sur_clinical), rownames(exp_t))
    # [1] TRUE
    
    library(survival)
    gene2sur = colnames(exp_t)[1:100]
    gene_sur_list = lapply(seq(gene2sur), function(i){
      # i = 1
      print(paste0(i, " / ", length(gene2sur)))
      gene_sle = gene2sur[i]
      group_info = ifelse(exp_t[,gene_sle] > median(exp_t[,gene_sle]),
                          "high", "low")
      #表达值全为0的情况,跳过
      if(length(unique(group_info))==1) return(NULL)
      surdata = sur_clinical %>% 
        dplyr::select(status, time) %>% 
        dplyr::mutate(group = group_info)
      surv_diff <- survdiff(Surv(time, status) ~ group, data = surdata)
      p.val = 1 - pchisq(surv_diff$chisq, length(surv_diff$n) - 1)
      return(c(p.val, median(exp_t[,gene_sle])))
    })
    names(gene_sur_list) = gene2sur
    gene_sur = do.call(rbind, gene_sur_list) %>% as.data.frame()
    colnames(gene_sur) = c("pval","median_Exp")
    head(gene_sur)
    #                        pval  median_Exp
    # ENSG00000000003 0.827913157 20.13245532
    # ENSG00000000005 0.491871470  0.01302639
    # ENSG00000000419 0.001291037 20.46595268
    # ENSG00000000457 0.680007018  1.97177656
    # ENSG00000000460 0.024186393  0.86061452
    # ENSG00000000938 0.245993727  0.97951132
    

    相关文章

      网友评论

          本文标题:TCGA_批量基因_生存分析_流程搭建

          本文链接:https://www.haomeiwen.com/subject/hlaicrtx.html