硬着头皮往下走PCA|GSEA

作者: Juan_NF | 来源:发表于2019-08-27 16:36 被阅读0次

    Paper:Tumor Evolution and Drug Response in Patient-Derived Organoid Models of Bladder Cancer
    Figures:

    PCA
    ssGSEA
    其实,是很简单的处理:
    • 差异分析后提取top1000进行PCA,要表达的意思是,tumor和TCGA的tumor能聚在一起;
    • 对差异基因进行GSEA的KEGG富集(还有GO富集,当时眼睛可能有点儿瞎,一直在想为啥extracellular的没有富集到。。。那不是KEGG啊,傻孩子,此处感谢jimmy);
    • 有了其实,必有但是;作者上传的矩阵是辣个样子的,a.GEOquery拿不到矩阵;b.网页下载的矩阵是DESeq2-normalized countsc.英文不翻之DESeq2 doesn’t actually use normalized counts, rather it uses the raw counts and models the normalization inside the Generalized Linear Model (GLM). These normalized counts will be useful for downstream visualization of results, but cannot be used as input to DESeq2 or any other tools that peform differential expression analysis which use the negative binomial model.

    纯代码:

    Step1-download
    ###一些常规的设置
    rm(list = ls())#清空环境变量
    options(stringsAsFactors = F)##字符不作为因子读入
    #####数据下载
    library(GEOquery)
    h<-'GSE103990.Rdata'
    ####getGPL获得平台的注释信息,但下载速度会慢很多
    ####而且注释文件格式大多不如bioconductor包好用
    if(!file.exists(h)){
      gset<-getGEO('GSE103990',destdir='.',
                   AnnotGPL=F,
                   getGPL=F)
      save(gset,file=h)
    }
    load('GSE103990.Rdata')
    ex<- exprs(gset[[1]])
    pd <- pData(gset[[1]])
    #system('nohup wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE103nnn/GSE103990/suppl/GSE103990_Normalized_counts.txt.gz &')
    counts_nor <- read.table('GSE103990_Normalized_counts.txt.gz')
    save(counts_nor,pd,file='Nor.Rdata')
    
    Step2-差异分析(数据不合适,但代码木有问题)
    rm(list = ls())
    options(stringsAsFactors = F)
    load('Nor.Rdata')
    rownames(counts_nor)<- substr(rownames(counts_nor),1,15)
    exprSet<- floor(counts_nor)
    pd <- pd[match(colnames(exprSet),pd$description.1),]
    group_list <- ifelse(grepl('org',pd$title),'org','tumor')
    suppressMessages(library(DESeq2))
    #### 第一步,构建DESeq2的DESeq对象
    colData <- data.frame(row.names=colnames(exprSet),group_list=group_list)
    dds <- DESeqDataSetFromMatrix(countData = exprSet,colData = colData,
                                  design = ~ group_list)
    #### 第二步,进行差异表达分析
    dds2 <- DESeq(dds)
    res <- results(dds2,contrast=c("group_list","org","tumor"))
    resOrdered <- res[order(res$padj),]
    DEG <- as.data.frame(resOrdered)
    DESeq2_DEG = na.omit(DEG)
    nrDEG=DESeq2_DEG[,c(2,6)]
    colnames(nrDEG)=c('log2FoldChange','pvalue')  
    save(nrDEG, DESeq2_DEG, file = "DEG.Rdata")
    
    Step3-PCA
    rm(list=ls())
    library(edgeR)
    load('Nor.Rdata')
    load('BLC.Rdata')
    load('DEG.Rdata')
    rownames(counts_nor)<- substr(rownames(counts_nor),1,15)
    nor_BLCA <- edgeR::cpm(expr_BLC,log=T)
    nor_paper <- edgeR::cpm(floor(counts_nor),log=T)
    inter_gene<- intersect(rownames(nor_BLCA),rownames(nor_paper))
    choose_gene <- rownames(nrDEG)[abs(nrDEG$log2FoldChange)>1.5&nrDEG$pvalue<0.01]
    nrDEG <- nrDEG[order(abs(nrDEG$log2FoldChange),nrDEG$pvalue,decreasing = T),]
    final_choose<- rownames(nrDEG)[rownames(nrDEG)%in%inter_gene][1:1000]
    nr_paper <- nor_paper[final_choose,]
    nr_BLC <- nor_BLCA[final_choose,]
    nr_pca <- cbind(nr_paper,nr_BLC)
    group_list <- ifelse(grepl('TCGA',colnames(nr_pca)),'TCGA',ifelse(grepl('org',pd$title),'org','tumor'))
    library("FactoMineR")
    library("factoextra") 
    ####
    df.pca <- PCA(t(nr_pca), graph = FALSE)
    fviz_pca_ind(df.pca,
                 geom.ind = "point",
                 col.ind = group_list, 
                 addEllipses = F, 
                 legend.title = "Groups")
    
    Step4-GSEA

    如果对GSEA中的细节感兴趣,看https://www.jianshu.com/p/be8fe1318850

    rm(list=ls())
    load('DEG.Rdata')
    library(org.Hs.eg.db)
    library(clusterProfiler)
    gene <- bitr(rownames(nrDEG), fromType = "ENSEMBL",
                    toType =  "ENTREZID",
                    OrgDb = org.Hs.eg.db)
    gene$logfc <- nrDEG$log2FoldChange[match(gene$ENSEMBL,rownames(nrDEG))]
    geneList=nrDEG$log2FoldChange
    names(geneList)=gene$ENTREZID
    geneList=sort(geneList,decreasing = T)
    head(geneList)
    library(clusterProfiler)
    kk_gse <- gseKEGG(geneList     = geneList,
                      organism     = 'hsa',
                      nPerm        = 1000,
                      minGSSize    = 10,
                      pvalueCutoff = 0.9,
                      verbose      = FALSE)
    go_gse <- gseGO(geneList     = geneList,
                    OrgDb=org.Hs.eg.db,
                      nPerm        = 1000,
                      minGSSize    = 10,
                      pvalueCutoff = 0.9,
                      verbose      = FALSE)
    paper_choose <- c('Cell adhesion molecules (CAMs)',
                      'Cell cycle',
                      'ErbB signaling pathway',
                      'extracellular structure organization')
    a <- list()
    for (i in 1:3){
    a[[i]]<- gseaplot2(kk_gse, 
             geneSetID = kk_gse@result$ID[kk_gse@result$Description==paper_choose[i]],
             pvalue_table = T)
      }
    a
    gseaplot2(go_gse, 
              geneSetID = go_gse@result$ID[go_gse@result$Description==paper_choose[4]],
              pvalue_table = T)
    
    Results:
    • 写在最后,其实PCA的目的是为了说明,肿瘤和TCGA的肿瘤可以聚在一起,而org聚在一起;
    • 如果感兴趣的话,就自己从上游走一遍RNA-seq流程,再走一遍我这里R的代码;再如果,你重复出来了,一定一定一定要回复告诉我,我给你个小福利;


      image.png
    image.png ErbB
    Cell cycle
    CAM
    extracellular

    课程分享
    生信技能树全球公益巡讲
    https://mp.weixin.qq.com/s/E9ykuIbc-2Ja9HOY0bn_6g
    B站公益74小时生信工程师教学视频合辑
    https://mp.weixin.qq.com/s/IyFK7l_WBAiUgqQi8O7Hxw
    招学徒:
    https://mp.weixin.qq.com/s/KgbilzXnFjbKKunuw7NVfw

    相关文章

      网友评论

        本文标题:硬着头皮往下走PCA|GSEA

        本文链接:https://www.haomeiwen.com/subject/vtveectx.html