美文网首页TC
TCGA 01 TCGAbiolinks下载转录组数据差异分析

TCGA 01 TCGAbiolinks下载转录组数据差异分析

作者: rochiman | 来源:发表于2020-05-18 14:11 被阅读0次

    1. 准备

    安装和导入TCGAbiolinks包

    if (!requireNamespace("BiocManager", quietly = TRUE))
        install.packages("BiocManager")
    
    BiocManager::install("TCGAbiolinks")
    
    library(TCGAbiolinks)
    library(dplyr)
    library(DT)
    

    查看各个癌种的项目id,总共有65个ID值

    getGDCprojects()$project_id
    # [1] "GENIE-MSK"             "TCGA-UCEC"             "TCGA-ACC"             
    # [4] "TCGA-LGG"              "TCGA-SARC"             "TCGA-PAAD"            
    # [7] "TCGA-ESCA"             "TCGA-LUAD"             "TCGA-PRAD"            
    # [10] "GENIE-VICC"            "TCGA-LAML"             "TCGA-KIRC"            
    # [13] "TCGA-COAD"             "TCGA-PCPG"             "TCGA-HNSC"            
    # [16] "BEATAML1.0-COHORT"     "BEATAML1.0-CRENOLANIB" "GENIE-JHU"            
    # [19] "MMRF-COMMPASS"         "TCGA-LUSC"             "TCGA-OV"              
    # [22] "TCGA-GBM"              "TCGA-UCS"              "WCDT-MCRPC"           
    # [25] "TCGA-MESO"             "TCGA-TGCT"             "TCGA-KICH"            
    # [28] "TCGA-READ"             "TCGA-UVM"              "TCGA-STAD"            
    # [31] "TCGA-THCA"             "OHSU-CNL"              "GENIE-DFCI"           
    # [34] "GENIE-NKI"             "ORGANOID-PANCREATIC"   "VAREPOP-APOLLO"       
    # [37] "GENIE-GRCC"            "FM-AD"                 "TARGET-ALL-P1"        
    # [40] "TARGET-ALL-P2"         "TARGET-ALL-P3"         "TARGET-RT"            
    # [43] "CTSP-DLBCL1"           "GENIE-UHN"             "GENIE-MDA"            
    # [46] "TARGET-AML"            "TARGET-NBL"            "TARGET-WT"            
    # [49] "NCICCR-DLBCL"          "TCGA-LIHC"             "TCGA-THYM"            
    # [52] "TCGA-CHOL"             "TCGA-SKCM"             "TARGET-CCSK"          
    # [55] "TARGET-OS"             "TCGA-DLBC"             "TCGA-CESC"            
    # [58] "TCGA-BRCA"             "TCGA-KIRP"             "TCGA-BLCA"            
    # [61] "CGCI-HTMCP-CC"         "HCMI-CMDC"             "CGCI-BLGSP"           
    # [64] "CPTAC-2"               "CPTAC-3" 
    

    查看project中有哪些数据类型,如查询"TCGA-LUAD",有7种数据类型,case_count为病人数,file_count为对应的文件数,file_size为总文件大小,若要下载表达谱,可以设置参数data.category="Transcriptome Profiling"

    TCGAbiolinks:::getProjectSummary("TCGA-LUAD")
    # $case_count
    # [1] 585
    # 
    # $file_count
    # [1] 18162
    # 
    # $file_size
    # [1] 2.304064e+13
    # 
    # $data_categories
    #   case_count file_count               data_category
    # 1        519       2916     Transcriptome Profiling
    # 2        569       5368 Simple Nucleotide Variation
    # 3        585       2731                 Biospecimen
    # 4        585        623                    Clinical
    # 5        579        657             DNA Methylation
    # 6        518       3383       Copy Number Variation
    # 7        582       2484            Sequencing Reads
    

    2. 下载LUAD肺腺癌的转录组数据

    建立查询

    query <- GDCquery(project = "TCGA-LUAD",
                      data.category = "Transcriptome Profiling",
                      data.type = "Gene Expression Quantification", 
                      workflow.type = "HTSeq - Counts")
    

    查看所有样本编号

    # 594个barcode
    samplesDown <- getResults(query,cols=c("cases"))
    

    从结果中筛选出肿瘤样本barcode:TP(primary solid tumor)

    # 533个barcode
    dataSmTP <- TCGAquery_SampleTypes(barcode = samplesDown,
                                      typesample = "TP")
    

    从结果中筛选出NT(正常组织)样本的barcode:NT()

    # 59个barcode
    dataSmNT <- TCGAquery_SampleTypes(barcode = samplesDown,
                                      typesample = "NT")
    

    还有2个复发的样本barcode:NR(Recurrent Solid Tumor)

    # 2个barcode
    dataSmTR <- TCGAquery_SampleTypes(barcode = samplesDown,
                                      typesample = "TR")
    

    重新按照样本分组建立查询

    queryDown <- GDCquery(project = "TCGA-LUAD", 
                          data.category = "Transcriptome Profiling",
                          data.type = "Gene Expression Quantification", 
                          workflow.type = "HTSeq - Counts", 
                          barcode = c(dataSmTP, dataSmNT)
                          )
    

    下载查询到的数据,默认存放位置为当前工作目录下的GDCdata文件夹中

    GDCdownload(query = queryDown, files.per.chunk=6, method ="api", 
                directory ="lung_cancer")
    

    3. 数据读入与预处理

    读取下载的数据并将其准备到R对象中,在工作目录生成luad_case.rda文件,同时还产生Human_genes__GRCh38_p12_.rda文件(project文件)

    dataPrep1 <- GDCprepare(query = queryDown, save = TRUE, 
                            save.filename = "luad_cases.rda", 
                            directory ="lung_cancer")
    # Starting to add information to samples
    # => Add clinical information to samples
    # => Adding TCGA molecular information from marker papers
    # => Information will have prefix 'paper_' 
    # luad subtype information from:doi:10.1038/nature13385
    # Accessing www.ensembl.org to get gene information
    # Downloading genome information (try:0) Using: Human genes (GRCh38.p13)
    # From the 60483 genes we couldn't map 3990(不能mapping则自动删除该基因)
    # 去除dataPrep1中的异常值,dataPrep数据中含有肿瘤组织和正常组织的数据
    
    # save(dataPrep1, file="dataPrep1_LUAD_TP_TN.RData")
    rm(list=ls())
    load("dataPrep1_LUAD_TP_TN.RData")
    

    TCGAanalyze_Preprocessing执行不同样本表达矩阵之间相关性(Array Array Intensity correlation,AAIC)分析。根据样本之间Spearman相关系数的平方对称矩阵和箱线图,找到具有低相关性的样本,这些样本可以被识别为可能的异常值。使用spearman相关系数去除数据中的异常值,一般设置阈值为0.6。在该次分析中无异常样本

    # 无样本被删除
    dataPrep <- TCGAanalyze_Preprocessing(object = dataPrep1, 
                                          cor.cut = 0.6,
                                          datatype = "HTSeq - Counts",
                                          filename = "AAIC.png")
    

    将数据进行标准化使得不同样本之间具有可比性,使用包含EDASeq包功能的TCGAanalyze_Normalization函数,将mRNA标准化,此功能实现泳道内归一化程序,以针对GC含量效应(或其他基因水平的效应):全局缩放和分位数归一化。

    dataNorm.luad <- TCGAanalyze_Normalization(tabDF = dataPrep,
                                               geneInfo = geneInfo,
                                               method = "gcContent")
    

    使用TCGAanalyze_Filtering对低表达量的mRNA进行过滤,method = "quantile"时为筛选出所有样本中均值小于所有样本中rowMeans qnt.cut = 0.25的分位数(默认为25%的分位数)基因,即只保留表达量为前75%的基因。参考:The filtering of low-expression genes of RNA-seq data

    dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm.luad,
                                      method = "quantile", 
                                      qnt.cut =  0.25)
    save(dataFilt, file="dataFilt.RData")
    load("dataFilt.RData")
    

    4. 分组及差异分析

    选择NT组的前10和TP组的前10个样本进行差异分析

    # selection of normal samples "NT"
    samplesNT <- TCGAquery_SampleTypes(barcode = colnames(dataFilt),
                                       typesample = c("NT"))
    # selection of tumor samples "TP"
    samplesTP <- TCGAquery_SampleTypes(barcode = colnames(dataFilt), 
                                       typesample = c("TP"))
    # Diff.expr.analysis (DEA)
    dataDEGs <- TCGAanalyze_DEA(mat1 = dataFilt[,samplesNT[1:10]],
                                mat2 = dataFilt[,samplesTP[1:10]],
                                Cond1type = "Normal",
                                Cond2type = "Tumor",
    #                            fdr.cut = 0.01 ,
    #                            logFC.cut = 1,
                                method = "glmLRT")
    # Batch correction skipped since no factors provided
    # ----------------------- DEA -------------------------------
    #   there are Cond1 type Normal in  10 samples
    # there are Cond2 type Tumor in  10 samples
    # there are  12980 features as miRNA or genes 
    # I Need about  8.7 seconds for this DEA. [Processing 30k elements /s]  
    # ----------------------- END DEA -------------------------------
    
    DEG.LUAD.edgeR <- TCGAanalyze_DEA(mat1 = dataFilt[,samplesNT[1:10]],
                                      mat2 = dataFilt[,samplesTP[1:10]],
                                      pipeline="edgeR",
                                      batch.factors = c("TSS"),
                                      Cond1type = "Normal",
                                      Cond2type = "Tumor",
                                      voom =FALSE, 
                                      method = "glmLRT",
                                      # fdr.cut = 0.01,  #保留FDR<0.01的基因
                                      # logFC.cut = 1 #保留logFC>1的基因
    )
    # ----------------------- DEA -------------------------------
    #   there are Cond1 type Normal in  10 samples
    # there are Cond2 type Tumor in  10 samples
    # there are  12980 features as miRNA or genes 
    # I Need about  8.7 seconds for this DEA. [Processing 30k elements /s]  
    # ----------------------- END DEA -------------------------------
    

    5. 绘制火山图和热图

    火山图绘制

    valcano_data <- data.frame(genes=rownames(DEG.LUAD.edgeR), 
                               logFC=DEG.LUAD.edgeR$logFC, 
                               FDR=DEG.LUAD.edgeR$FDR,
                               group=rep("NotSignificant", 
                                         nrow(DEG.LUAD.edgeR)),
                               stringsAsFactors = F)
    
    valcano_data[which(valcano_data['FDR'] < 0.05 & 
                         valcano_data['logFC'] > 1.5),"group"] <- "Increased"
    valcano_data[which(valcano_data['FDR'] < 0.05 &
                         valcano_data['logFC'] < -1.5),"group"] <- "Decreased"
    
    cols = c("darkgrey","#00B2FF","orange")
    names(cols) = c("NotSignificant","Increased","Decreased")
    
    library(ggplot2)
    ggplot(valcano_data, aes(x = logFC, y = -log10(FDR), color = group))+
      scale_colour_manual(values = cols) +
      ggtitle(label = "Volcano Plot", subtitle = "LUAD 10 samples volcano plot") +
      geom_point(size = 2.5, alpha = 1, na.rm = T) +
      theme_bw(base_size = 14) + 
      theme(legend.position = "right") + 
      xlab(expression(log[2]("logFC"))) + 
      ylab(expression(-log[10]("FDR"))) +
      geom_hline(yintercept = 1.30102, colour="#990000", linetype="dashed") + 
      geom_vline(xintercept = 1.5849, colour="#990000", linetype="dashed") + 
      geom_vline(xintercept = -1.5849, colour="#990000", linetype="dashed")+ 
      scale_y_continuous(trans = "log1p")
    
    
    火山图

    在火山图中标注特定基因如CALCA和MUC5B基因为红色

    valcano_data_specific_genes = valcano_data[which(valcano_data$genes %in% 
                                              c("CALCA", "MUC5B")), ]
    
    cols = c("darkgrey","#00B2FF","orange", "red")
    names(cols) = c("NotSignificant","Increased","Decreased", "Specific")
    
    library(ggplot2)
    ggplot(valcano_data, aes(x = logFC, y = -log10(FDR), color = group))+
      scale_colour_manual(values = cols) +
      ggtitle(label = "Volcano Plot", subtitle = "LUAD 10 samples volcano plot") +
      geom_point(size = 2.5, alpha = 1, na.rm = T) +
      theme_bw(base_size = 14) + 
      theme(legend.position = "right") + 
      xlab(expression(log[2]("logFC"))) + 
      ylab(expression(-log[10]("FDR"))) +
      geom_hline(yintercept = 1.30102, colour="#990000", linetype="dashed") + 
      geom_vline(xintercept = 1.5849, colour="#990000", linetype="dashed") + 
      geom_vline(xintercept = -1.5849, colour="#990000", linetype="dashed")+ 
      scale_y_continuous(trans = "log1p") + 
      geom_point(data=valcano_data_specific_genes, col="red", size=2)
    
    火山图——带标注

    合并平均表达量与差异基因表格(方便查询)

    #获取肿瘤组织对应的表达矩阵
    dataTP <- dataFilt[,samplesTP[1:10]]
    
    #获取正常组织对应的表达矩阵
    dataTN <- dataFilt[,samplesNT[1:10]]
    
    # 合并表格,增加2组每个基因的平均表达量
    dataDEGsFiltLevel <- TCGAanalyze_LevelTab(FC_FDR_table_mRNA=dataDEGsFilt,
                                              typeCond1="Normal",
                                              typeCond2="Tumor", 
                                              TableCond1=dataTN,
                                              TableCond2=dataTP)
    #查看结果
    head(dataDEGsFiltLevel,2)
    # mRNA     logFC         FDR   Normal    Tumor     Delta
    # SFTPC   SFTPC -9.656391 0.001493513 850460.0  76090.8 8212374.1
    # SFTPA2 SFTPA2 -1.755048 0.277785397 521998.4 154404.7  916132.2
    # CALCA CALCA   5.803164    0.2678382   22.8    9511.5  132.3121
    # MUC5B MUC5B   5.2703226   1.827029e-03    2722.7  49155.1 14349.507227
    
    

    热图绘制

    library(pheatmap)
    t <- dataFilt[valcano_data$genes[which(valcano_data['FDR'] < 0.05 & 
                                abs(valcano_data['logFC']) >1.5)],
                  c(samplesTP[1:10], samplesNT[1:10])]
    
    t <- na.omit(t)
    t <- log2(t+1)
    col <- data.frame(Type=c(rep("Tumor", 10), rep("Normal", 10)))
    col$Type <- factor(col$Type, levels = c("Normal", "Tumor"))
    rownames(col) <- colnames(t)
    pheatmap(t,
             annotation_col = col,
             annotation_colors = list(Type=c(Tumor="red", Normal="#00B2FF")),
             annotation_legend = T,
             border_color = "black",
             cluster_rows = T,
             cluster_cols = T,
             show_colnames = F,
             show_rownames = F,
             color = colorRampPalette(c("green", "black", "red"))(50))
    
    热图

    相关文章

      网友评论

        本文标题:TCGA 01 TCGAbiolinks下载转录组数据差异分析

        本文链接:https://www.haomeiwen.com/subject/inhbohtx.html