美文网首页
TCGA泛癌TMB计算

TCGA泛癌TMB计算

作者: 小熊_wh | 来源:发表于2022-12-01 10:53 被阅读0次

    在前面系列教程中分享了如何下载TCGA表达矩阵(2022新版TCGA数据下载与整理),通过R进行数据合并就能得到TCGA泛癌表达矩阵了。有时候需要分析基因表达与TMB之间的相关性,就会涉及到TCGA突变数据下载和TMB计算。TCGA突变数据maf以前可以直接在TCGA下载,也可以用TCGAbiolinks包的GDCquery_Maf函数下载,后来TCGA不让下载了,TCGAbiolinks的GDCquery_Maf函数在新版本里面也被抛弃了。但是TCGAbiolinks包的作者提供了替代方案,具体见TCGAbiolinks: Searching, downloading and visualizing mutation files,作者提供了hg38和hg19两个版本的下载方法,以hg38的TCGA-CHOL为例:

    library("TCGAbiolinks")
    query <- GDCquery(
        project = "TCGA-CHOL", 
        data.category = "Simple Nucleotide Variation", 
        access = "open", 
        legacy = FALSE, 
        data.type = "Masked Somatic Mutation", 
        workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
    )
    GDCdownload(query)
    maf <- GDCprepare(query)
    

    得到的maf就可以用于后续TMB计算了,TMB的计算参考这个链接(TCGA 数据分析实战 —— TMB 与免疫浸润联合分析):

    get_TMB <- function(file) {
      # 需要用到的列
      use_cols <- c(
        "Hugo_Symbol", "Variant_Classification", "Tumor_Sample_Barcode", 
        "HGVSc", "t_depth", "t_alt_count"
      )
      # 删除这些突变类型
      mut_type <- c(
        "5'UTR", "3'UTR", "3'Flank", "5'Flank", "Intron", "IGR",
        "Silent", "RNA", "Splice_Region"
      )
      # 读取文件
      df <- read_csv(file, col_select = use_cols)
      data <- df %>% subset(!Variant_Classification %in% mut_type) %>%
        # 计算 VAF
        mutate(vaf = t_alt_count / t_depth) %>%
        group_by(Tumor_Sample_Barcode) %>%
        summarise(mut_num = n(), TMB = mut_num / 30, MaxVAF = max(vaf))
      return(data)
    }
    

    为了得到泛癌的TMB数据,我将代码写成循环,并稍微做了些调整。

    rm(list=ls())
    library(TCGAbiolinks)
    library(dplyr)
    library(stringr)
    projects <- getGDCprojects()$project_id
    projects <- projects[grepl('^TCGA', projects, perl=TRUE)]
    projects <- projects[order(projects)]
    
    pan_TMB=list()
    for (project in projects){
      query <- GDCquery(
        project = project, 
        data.category = "Simple Nucleotide Variation", 
        access = "open", 
        legacy = FALSE, 
        data.type = "Masked Somatic Mutation", 
        workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
      )
      GDCdownload(query)
      maf <- GDCprepare(query)
      
      ## TMB calculation
      # Code Reference https://zhuanlan.zhihu.com/p/394609586
    get_TMB <- function(file) {
      use_cols <- c("Hugo_Symbol", "Variant_Classification", "Tumor_Sample_Barcode", 
                      "HGVSc", "t_depth", "t_alt_count")
      # 删除这些突变类型
      mut_type <- c("5'UTR", "3'UTR", "3'Flank", "5'Flank", "Intron", "IGR",
                      "Silent", "RNA", "Splice_Region")
      # 读取文件
      df <- select(file, use_cols)
      data <- df %>% subset(!Variant_Classification %in% mut_type) %>%
      # 计算 VAF
        mutate(vaf = t_alt_count / t_depth) %>%
        group_by(Tumor_Sample_Barcode) %>%
        summarise(mut_num = n(), TMB = mut_num / 30, MaxVAF = max(vaf))
      return(data)
      }
      
      pan_TMB[[project]]=get_TMB(maf)
    }
    
    pan_TMB=do.call(rbind,pan_TMB)
    pan_TMB$Cancer=str_split(rownames(pan_TMB),'-',simplify = T)[,2]
    pan_TMB$Cancer=str_split(pan_TMB$Cancer,'[.]',simplify = T)[,1]
    pan_TMB$Tumor_Sample_Barcode=str_sub(pan_TMB$Tumor_Sample_Barcode,1,16)
    pan_TMB=pan_TMB[,c(5,1,3)]
    colnames(pan_TMB)=c("Cancer","ID","TMB")
    save(pan_TMB,file="pan_TMB.Rdata")
    

    但是在循环到TCGA-MESO环节,总是会报"Error: Failure to retrieve index 140/0",然后R就重启了。没办法,把TCGA-MESO单独出来,再合并。中间处理TCGA-MESO时在用maftools的read.maf时也有报错"Error in read.maf(x, isTCGA = TRUE) : No non-synonymous mutations found Check vc_nonSyn`` argumet inread.maffor details",后来按照Github上的方法直接读入表格([No non-synonymous mutations found Check vc_nonSyn argumet in read.maf for details #838](https://github.com/PoisonAlien/maftools/issues/838)

    最终代码如下:

    rm(list=ls())
    library(TCGAbiolinks)
    library(dplyr)
    library(stringr)
    projects <- getGDCprojects()$project_id
    projects <- projects[grepl('^TCGA', projects, perl=TRUE)]
    projects <- projects[order(projects)]
    projects=projects[-19]
    pan_TMB=list()
    for (project in projects){
      query <- GDCquery(
        project = project, 
        data.category = "Simple Nucleotide Variation", 
        access = "open", 
        legacy = FALSE, 
        data.type = "Masked Somatic Mutation", 
        workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
      )
      GDCdownload(query)
      maf <- GDCprepare(query)
      
      ## TMB calculation
      # Code Reference https://zhuanlan.zhihu.com/p/394609586
      get_TMB <- function(file) {
        use_cols <- c("Hugo_Symbol", "Variant_Classification", "Tumor_Sample_Barcode", 
                      "HGVSc", "t_depth", "t_alt_count")
        # 删除这些突变类型
        mut_type <- c("5'UTR", "3'UTR", "3'Flank", "5'Flank", "Intron", "IGR",
                      "Silent", "RNA", "Splice_Region")
        # 读取文件
        df <- select(file, use_cols)
        data <- df %>% subset(!Variant_Classification %in% mut_type) %>%
          # 计算 VAF
          mutate(vaf = t_alt_count / t_depth) %>%
          group_by(Tumor_Sample_Barcode) %>%
          summarise(mut_num = n(), TMB = mut_num / 30, MaxVAF = max(vaf))
        return(data)
      }
      
      pan_TMB[[project]]=get_TMB(maf)
    }
    
    # For TCGA-MESO
    query <- GDCquery(
      project = "TCGA-MESO", 
      data.category = "Simple Nucleotide Variation", 
      access = "open", 
      legacy = FALSE, 
      data.type = "Masked Somatic Mutation", 
      workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
    )
    GDCdownload(query)
    
    # Reference:https://github.com/PoisonAlien/maftools/issues/838
    mafFilePath2 = dir(path = "GDCdata/TCGA-MESO",pattern = "masked.maf.gz$",full.names = T,recursive=T) 
    TCGA_MESO = lapply(mafFilePath2, data.table::fread, skip = "Hugo_Symbol")
    TCGA_MESO = data.table::rbindlist(l = TCGA_MESO, use.names = TRUE, fill = TRUE)
    pan_TMB[["TCGA-MESO"]]=get_TMB(TCGA_MESO)
    
    pan_TMB=do.call(rbind,pan_TMB)
    pan_TMB$Cancer=str_split(rownames(pan_TMB),'-',simplify = T)[,2]
    pan_TMB$Cancer=str_split(pan_TMB$Cancer,'[.]',simplify = T)[,1]
    pan_TMB$Tumor_Sample_Barcode=str_sub(pan_TMB$Tumor_Sample_Barcode,1,16)
    pan_TMB=pan_TMB[,c(5,1,3)]
    colnames(pan_TMB)=c("Cancer","ID","TMB")
    pan_TMB$TMB=round(pan_TMB$TMB,1) # 保留一位小数
    pan_TMB=pan_TMB[order(pan_TMB$Cancer),]
    save(pan_TMB,file="pan_TMB.Rdata")
    

    最终数据如下:


    image.png

    查看下每个肿瘤的突变情况。

    table(pan_TMB$Cancer)
    
    # ACC BLCA BRCA CESC CHOL COAD DLBC ESCA  GBM HNSC KICH KIRC KIRP LAML  LGG LIHC LUAD LUSC MESO   OV PAAD PCPG 
    # 90  414  986  289   51  454   47  184  461  510   66  357  282  128  521  371  616  544   78  462  170  182 
    # PRAD READ SARC SKCM STAD TGCT THCA THYM UCEC  UCS  UVM 
     # 493  161  238  469  431  146  494  121  518   57   80 
    

    通过ID可以和TCGA表达矩阵合并,再根据TCGA基因与免疫评分带侧边密度图的相关性点图的方法 就可以进行TMB与基因表达相关性分析了。

    相关文章

      网友评论

          本文标题:TCGA泛癌TMB计算

          本文链接:https://www.haomeiwen.com/subject/yhjefdtx.html