美文网首页
2022-03-20 TCGA 差异基因处理2

2022-03-20 TCGA 差异基因处理2

作者: 千容安 | 来源:发表于2022-03-21 00:05 被阅读0次
    setwd("C:\\Users\\Administrator.DESKTOP-4UQ3Q0K\\Desktop\\国奖")
    library("TCGAbiolinks")
    length(paired)
    NT <- data.frame(NT1=substr(samplesNT,1,12),NT2=samplesNT)
    TP <- data.frame(TP1 =substr(samplesTP,1,12),TP2=samplesTP)
    TP_NT <- merge(TP,NT,by.x = "TP1",by.y = "NT1")
    head(TP_NT,3)
    #1.1获取配对正常组织的barcodes:
    TP <- TP_NT$TP2
    
    #1.2获取配对肿瘤组织的barcodes:
    NT <- TP_NT$NT2
    #2 参照前面几期进行数据下载和数据预处理:
    queryDown <- GDCquery(project = "TCGA-PRAD",
                          data.category = "Transcriptome Profiling",
                          data.type = "Gene Expression Quantification",
                          workflow.type = "HTSeq - Counts",
                          barcode = c(TP, NT))
    
    GDCdownload(queryDown,method = "api", directory = "GDCdata",
                files.per.chunk = 10)
    
    #2.1 将数据准备成R语言可处理的形式
    dataPrep1 <- GDCprepare(query = queryDown, save = TRUE, save.filename =
                              "PRAD_case2.rda")
    
    #2.2数据预处理:根据样本与样本之间的spearman相关系数去掉离群值
    dataPrep2 <- TCGAanalyze_Preprocessing(object= dataPrep1,
                                           cor.cut = 0.6,
                                           datatype = "HTSeq - Counts")
    
    #2.3选择肿瘤纯度大于90%的肿瘤样本
    purityDATA <- TCGAtumor_purity(colnames(dataPrep1), 0, 0, 0, 0, 0.9)
    # # filtered 为被过滤的数据, pure_barcodes是我们要的肿瘤数据
    Purity.PRAD<-purityDATA$pure_barcodes
    normal.PRAD<-purityDATA$filtered
    View(normal.PRAD)  #52个
    View(Purity.PRAD)  #34个
    #获取肿瘤纯度大于90%的样本+正常组织样本,共计86个样本
    puried_data <-dataPrep2[,c(Purity.PRAD,normal.PRAD)]
    dim(puried_data)
    #有34个肿瘤纯度大于90%的样本
    
    #2.4基因注释
    library("SummarizedExperiment")
    rowData(dataPrep1)
    # DataFrame with 56602 rows and 3 columns
    #                 ensembl_gene_id external_gene_name original_ensembl_gene_id
    #                     <character>        <character>              <character>
    # ENSG00000000003 ENSG00000000003             TSPAN6       ENSG00000000003.13
    # ENSG00000000005 ENSG00000000005               TNMD        ENSG00000000005.5
    rownames(puried_data)<-rowData(dataPrep1)$external_gene_name
    write.csv(puried_data,file="puried.PRAD.csv",quote=FALSE)
    #2.5使用EDAseq进行文库大小和GC丰度标准化
    dataNorm <- TCGAanalyze_Normalization(tabDF = puried_data,
                                          geneInfo = geneInfo,
                                          method = "gcContent")
    
    #2.6过滤低count的基因,并将结果输出
    dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm,
                                      method = "quantile",
                                      qnt.cut =  0.25)
    
    write.csv(dataFilt,file = "paired_TCGA_PRAD_final.csv",quote = FALSE)
    
    
    
    
    
    
    TCGA_PRAD_data <- read.csv(file = "TCGA_PRAD_final.csv",
                               header = T,
                               row.names = 1,
                               check.names = FALSE)
    #samplesNT <- TCGAquery_SampleTypes(colnames(TCGA_PRAD_data), typesample = c("NT"))
    #samplesTP <- TCGAquery_SampleTypes(colnames(TCGA_PRAD_data), typesample = c("TP"))
    #paired <- intersect(substr(samplesNT,1,12),substr(samplesTP,1,12))
    
    dataFilt <- read.csv(file = "paired_TCGA_PRAD_final.csv",header = T,row.names = 1,check.names = FALSE)
    
    samplesNT <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("NT"))
    
    #设置typesample=TP,获取肿瘤组织对应的barcodes
    samplesTP <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("TP"))
    
    TCGA_PRAD_data <- read.csv(file = "paired_TCGA_PRAD_final.csv",header = T,row.names = 1,check.names = FALSE)
    
    mat1 <- TCGA_PRAD_data[,1:52]
    mat2 <- TCGA_PRAD_data[,53:106]
    DEG.PRAD.edgeR <- TCGAanalyze_DEA(mat1 = mat1,
                                      mat2 = mat2,
                                      pipeline  =  "edgeR",
                                      batch.factors =  c("TSS"),                                  Cond1type = "tumor",
                                      Cond2type = "normal",
                                      voom = FALSE,             ##设置了paired时,会出错(paired = TRUE),故此处未设置
                                      method = "glmLRT",
                                      contrast.formula = "Mycontrast =tumor -normal",
                                      fdr.cut = 0.01,           #设置过滤参数1,保留FDR<0.01的基因
                                      logFC.cut = 1)
    write.csv(DEG.PRAD.edgeR,file = "paired_DEG_by_edgeR.csv")
    
    dataFilt.PRAD.final<-read.csv(file = "paired_TCGA_PRAD_final.csv",header = T,row.names = 1,check.names = FALSE)
    
    dataDEGsFilt <- DEG.PRAD.edgeR[abs(DEG.PRAD.edgeR$logFC) >= 1,]
    str(dataDEGsFilt)
    # 'data.frame':    1336 obs. of  5 variables:
    #4.1 TCGAquery_SampleTypes()用于获取特定组织对应的barcodes,如肿瘤组织(TP)、正常组织(NT)
    #设置typesample =NT,获取正常组织对应的barcodes
    samplesNT <- TCGAquery_SampleTypes(colnames(dataFilt.PRAD.final), typesample = c("NT"))
    
    #设置typesample=TP,获取肿瘤组织对应的barcodes
    samplesTP <- TCGAquery_SampleTypes(colnames(dataFilt.PRAD.final), typesample = c("TP"))
    dataTP <- dataFilt.PRAD.final[,samplesTP]
    dataTN <- dataFilt.PRAD.final[,samplesNT]
    
    dataDEGsFiltLevel <- TCGAanalyze_LevelTab(FC_FDR_table_mRNA = dataDEGsFilt,
                                              typeCond1 = "Normal",
                                              typeCond2 = "Tumor",
                                              TableCond1 = dataTN,
                                              TableCond2 = dataTP)
    head(dataDEGsFiltLevel,2)
    pca <- TCGAvisualize_PCA(dataFilt= dataFilt,
                             dataDEGsFiltLevel=dataDEGsFiltLevel, 
                             ntopgenes = 100, 
                             group1=samplesNT, 
                             group2=samplesTP)
    

    从这开始不出图了

    #6.1 获取差异表达基因的表达水平
    datDEGs <- dataFilt.PRAD.final[match(rownames(DEG.PRAD.edgeR),rownames(dataFilt.PRAD.final)),]
    
    str(datDEGs)
    #'data.frame':    1336 obs. of  106 variables:
    
    #6.2 根据临床信息构造patient的metadata信息;或(和)基因的相关信息(此处不添加基因的注释信息)
    #获取每一个患者barcode(barcode的前12位代表的patient信息,前16位代表的是sample信息)对应的临床信息,但是其barcodes与datDEGs_test顺序不一致
    query <- GDCquery(project = "TCGA-PRAD",
                      data.category = "Clinical",
                      file.type = "xml",
                      barcode = substr(colnames(datDEGs),1,12))
    
    GDCdownload(query)  
    
    clinical <- GDCprepare_clinic(query,"patient")  #53个样本
    ##根据表达矩阵中的样本barcodes对样本临床信息匹配
    datDEGs_test_barcodes <- as.data.frame(substr(colnames(datDEGs),1,12), ncol = 1 )
    colnames(datDEGs_test_barcodes) <-"PRAD_patient_barcode"
    
    m <- clinical[match(datDEGs_test_barcodes[,1], clinical[ , 1 ]),]
    
    str(m)
    #'data.frame':    106 obs. of  68 variables:
    
    table(duplicated(m))
    #FALSE  TRUE
    #52    54
    #使用table(duplicated()查看m矩阵中是否有重复数据。
    #这里的重复数据来源(肿瘤组合和癌旁正常组织来源于同一患者)
    library(pheatmap)
    
    
    #基本做图结果
    pheatmap::pheatmap(datDEGs,scale = "row",show_rownames = F,show_colnames = F)
    #增加metadata信息
    col.mdat <- data.frame(Sex=m$gender,
                           status=m$vital_status,
                           group=c(rep("tumor",52),rep("normal",54)))
    rownames(col.mdat) <- colnames(datDEGs)  
    #保证列注释信息的行名与样本名(对应列)一致
    
    #设置图例的范围
    bk <- c(seq(-1,6,by=0.01))
    
    #绘制热图
    pheatmap::pheatmap(datDEGs,scale = "row",show_rownames = F,show_colnames = F,
                       annotation_col = col.mdat,
                       border_color=NA,
                       main = "Heatmap by pheatmap(edgeR)",
                       filename = "Heatmap_by_pheatmap.pdf",
                       color =c(colorRampPalette(colors = c("blue","white"))(length(bk)/2),
                                colorRampPalette(colors = c("white","red"))(length(bk)/2)),
                       legend_breaks=seq(-1,6,2),
                       breaks=bk)
    
    DEG.PRAD.filt<-DEG.PRAD.edgeR[which(abs(DEG.PRAD.edgeR$logFC) >= 4), ]
    str(DEG.PRAD.filt)
    #'data.frame':    24 obs. of  5 variables:
    #共有24个基因满足|logFC|≥4
    TCGAVisualize_volcano(DEG.PRAD.edgeR$logFC, 
                          DEG.PRAD.edgeR$FDR,
                          filename = "TumorvsNormal_FC8.edgeR.pdf", 
                          xlab = "logFC",
                          names = rownames(DEG.PRAD.edgeR), 
                          show.names = "highlighted",
                          x.cut = 1, 
                          y.cut = 0.01, 
                          highlight = rownames(DEG.PRAD.edgeR),
                          highlight.color = "orange",
                          title = "volcano plot by edgeR")
    title = "volcano plot by limma"
    plot(1:5,1:5)
    dev.off()
    dev.new()
    TCGAVisualize_volcano(x = DEG.PRAD.edgeR$logFC,
                          y = DEG.PRAD.edgeR$FDR,
                          filename = "volcanoexp.png",
                          x.cut = 1,
                          y.cut = 0.01,
                          names = rownames(DEG.PRAD.edgeR),
                          color = c("gray80","#99CC00","#FF99CC"),
                          names.size = 2,
                          xlab = " Gene expression fold change (Log2)",
                          legend = "State",
                          title = "Volcano plot",
                          width = 10)
    
    
    volcanoexp.png

    相关文章

      网友评论

          本文标题:2022-03-20 TCGA 差异基因处理2

          本文链接:https://www.haomeiwen.com/subject/bmiodrtx.html