美文网首页
富集分析与可视化:R 实现

富集分析与可视化:R 实现

作者: 挽山 | 来源:发表于2020-02-27 00:07 被阅读0次

    一. GO/KEGG

    (一)富集分析

    #source("https://bioconductor.org/biocLite.R")
    #options(BioC_mirror="http://mirrors.ustc.edu.cn/bioc/")
    #biocLite("DOSE") #画dotplot
    #biocLite("clusterProfiler")
    #biocLite("pathview") #通路画图
    #biocLite("DO.db")
    #biocLite("enrichplot")
    #biocLite("Cairo")
    #install.packages('xlsx')
    #install.packages('rJava')
    #install.packages('xlsxjars') 切换java10 但我忘了他是干嘛的了
    ## 
    library(DOSE)
    library(GO.db)
    library(org.Hs.eg.db)
    library(topGO)
    library(clusterProfiler)
    library(pathview)
    library(enrichplot)
    library(cowplot)
    library(ggplot2)
    library(Cairo)#go,kegg
    library(stringr)#kegg
    
    library(rJava)
    library(xlsxjars)
    library(xlsx)
    
    #这里是接WGCNA模块富集
    #GO
    Enrichment <- read.delim(file.choose(),sep = "\t",stringsAsFactors = F,header = T) # file=module+ID
    head(Enrichment)
    
    #选择需要富集的模块号
    cc<-c('4','6','7','9','17','29')
    pfc<-c('2','6','8','11',24)
    
    #模块内循环
    for (i in cc){
      #i=4
      Enrich_Source <- Enrichment[Enrichment$module== i,]
      gene <- as.character(Enrich_Source[,4]) #用ID列
      gene<-Enrichment[,2] #一般基因集合从这里跑,改ID所在列
      
      ego_BP <- enrichGO(gene = gene, #注意用的是哪个集合
                         OrgDb=org.Hs.eg.db,
                         ont = "BP",#生物过程(biological process)
                         pAdjustMethod = "BH",
                         minGSSize = 1,
                         pvalueCutoff = 0.05, #默认0.01
                         qvalueCutoff = 0.2,  #默认0.05,还见过设0.1的
                         readable = TRUE
                         )
      BP = as.data.frame(ego_BP @ result)
      write.xlsx(BP, paste('CC_1.5_10_0.15_GO_BP_Module_', i, '.xlsx', sep=""),row.names = F)
      write.xlsx(BP,'30-2233p-target_GO_BP.xlsx') #对应一般基因集合
      
     KEGG <- enrichKEGG(gene = gene, 
                         organism = "hsa", #R 3.5的版本就要这样写,而且必须联网
                         minGSSize = 1,
                         pvalueCutoff = 0.05,      #默认0.01
                         pAdjustMethod = "BH",
                         qvalueCutoff = 0.2,       #默认0.05,还见过设0.1的
                         use_internal_data = F
                         #readable = T
                        )
      KEGG_Pathway <- as.data.frame(KEGG @ result)
     write.xlsx(KEGG_Pathway,paste('CC_CC_1.5_10_0.15_KEGG_Pathway_Module_',i,'.xlsx',sep = ""),row.names = F)
     write.xlsx(KEGG_Pathway,'30-2233p-target_KEGG_Pathway.xlsx') #注意选择保存对象
     }
    
    #一般条图和气泡图就够用
    
    

    (二)GOChord、弦图什么的

    • 脱离clusterprofiler包,需要从以上富集结果整理输入文件
    #BiocManager::install("GOplot",ask = F,update = F)
    library(GOplot)
    library(readr)
    library(stringr)
    #https://www.jianshu.com/p/a50310f9418f
    #https://www.jianshu.com/p/48ac98098760
    #https://www.jianshu.com/p/9bda0ab65717
    #http://www.360doc.com/content/19/0416/12/52645714_829160785.shtml
    #输入文件准备
    #GO: ID term    genes   pvalue  category (可data.fram(ego))
    #genes: geneID  log2FC
    setwd('C:/Users/Documents/WORK/results')
    GO<-read_csv("27-GOChord-GO.csv", col_names = T);GO=data.frame(GO) 
    gene<-read_csv("27-GOChord-gene.csv", col_names = T);gene=data.frame(gene)
    GO$genes=str_replace_all(GO$genes, '/', ',')
    names(GO)=c('ID','term','genes','adj_pval','Category')
    names(gene)=c('ID','logFC')
    
    plot.data<-list(DA=GO, GE=gene)
    circ=data.frame();circ<-circle_dat(plot.data$DA, plot.data$GE)#创建绘图对象
    circ;names(circ)#"category" "ID"  "term"  "count"  "genes"  "logFC"  "adj_pval" "zscore"  
    process=unique(circ$term)
    #?????
    chord<-chord_dat(circ) #data,gene,process,error!
    head(chord)
    GOChord(chord)
    
    GOChord(chord, title="GOChord plot",#标题设置
            space = 0.03, #GO term处间隔大小设置
            limit = c(3, 5),
            #第一个数值为至少分配给一个基因的Goterm数,
            #第二个数值为至少分配给一个GOterm的基因数
            gene.order = 'logFC', gene.space = 0.25, gene.size = 5,#基因排序,间隔,名字大小设置
            #lfc.col=c('firebrick3', 'white','royalblue3'),##上调下调颜色设置
            #ribbon.col=colorRampPalette(c("blue", "red"))(length(EC$process)),#GO term 颜色设置
            ribbon.col=brewer.pal(length(EC$process), "Set3"))
    
    

    (三)进阶富集分析比较与可视化

    library(clusterProfiler)
    library(org.Hs.eg.db)
    library(dplyr)
    
    #多集合比较
    #https://blog.csdn.net/qazplm12_3/article/details/76474668
    #http://www.360doc.com/content/17/0728/08/19913717_674694421.shtml(修饰)
    #https://www.cnblogs.com/wangshicheng/articles/10122804.html(长文本截断,气泡大小)
    
    #data1
    data<-read.table(file = file.choose(),header = T,sep = '\t',stringsAsFactors = F)
    head(data)
    #模块号+ID
    cc_m4<-data[data$module=='4',3]
    cc_m6<-data[data$module=='6',3]
    cc_m7<-data[data$module=='7',3]
    cc_m9<-data[data$module=='9',3]
    cc_m17<-data[data$module=='17',3]
    cc_m29<-data[data$module=='29',3]
    
    CC<-list(M4=cc_m4,M6=cc_m6,M17=cc_m17,M29=cc_m29)
    
    #多组富集
    kegg_cc<-compareCluster(CC, 'enrichKEGG', organism = "hsa",pvalueCutoff = 0.05)
    dotplot(kegg_cc,showCategory = 30,color='pvalue') #可选呈现多少条目
    
    GO_bp_cc<-compareCluster(CC, 'enrichGO', ont = "BP", OrgDb=org.Hs.eg.db)
    dotplot(GO_bp_cc,showCategory = 12)
    
    #data2
    data2<-read.table(file = file.choose(),header = T,sep = '\t',stringsAsFactors = F)
    head(data2)
    
    pfc_m2<-data2[data2$module=='2',3]
    pfc_m5<-data2[data2$module=='5',3]
    pfc_m6<-data2[data2$module=='6',3]
    pfc_m8<-data2[data2$module=='8',3]
    pfc_m11<-data2[data2$module=='11',3]
    pfc_m24<-data2[data2$module=='24',3]
    
    PFC<-list(M2=pfc_m2,M6=pfc_m6,M8=pfc_m8)
    
    kegg_pfc<-compareCluster(PFC, 'enrichKEGG', organism = "hsa",pvalueCutoff = 0.05)
    dotplot(kegg_pfc,showCategory = 30,color='pvalue')
    
    GO_bp_pfc<-compareCluster(PFC, 'enrichGO', ont = "BP", OrgDb=org.Hs.eg.db)
    dotplot(GO_bp_pfc,showCategory = 12)
    
    
    # 保存结果 
    library(xlsx)
    setwd()
    res_kegg<-kegg@compareClusterResult
    write.table(res_kegg, file="CC_CompareEnrichment_KEGG.xlsl", row.names=F)
    res_GO_bp<-GO_bp@compareClusterResult
    write.table(res_GO_bp, file="CompareEnrichment_GOBP.xlsx", row.names=F)
    
    #结果筛选
    sub = res %>% group_by(Cluster) %>% top_n(-10, pvalue)
    sub10 = res[res$Description %in% sub$Description,]
    
    # 多模块间比较-----------------------------------------------------------------
    A<-list(CC_M4=cc_m4,
             CC_M6=cc_m6,
             #CC_M7=cc_m7,
             #CC_M9=cc_m9,
             CC_M17=cc_m17,
             CC_M29=cc_m29,
             
             PFC_M2=pfc_m2,
             #PFC_M5=pfc_m5, 
             PFC_M6=pfc_m6,
             PFC_M8=pfc_m8)
             #PFC_M11=pfc_m11,
             #PFC_M24=pfc_m24)
    
    kegg<-compareCluster(A, 'enrichKEGG', organism = "hsa",pvalueCutoff = 0.05)
    
    GO_bp<-compareCluster(A, 'enrichGO', ont = "BP", OrgDb=org.Hs.eg.db,pvalueCutoff = 0.05)
    GO_cc<-compareCluster(A, 'enrichGO', ont = "CC", OrgDb=org.Hs.eg.db,pvalueCutoff = 0.05)
    GO_mf<-compareCluster(A, 'enrichGO', ont = "MF", OrgDb=org.Hs.eg.db,pvalueCutoff = 0.05)
    
    #可视化-------------------------------------------------------------------------------------
    library(ggplot2)
    library(stringr)
    p=dotplot(kegg,
            showCategory = 6, #自定义
            includeAll=T,
            #color='p.adjust',
            font.size=10,
            title='xxxxxx', #一般不设置
            by='geneRatio'
            #by='count'
            )+
      theme(axis.text.x = element_text(angle=90))
    p2<-p + scale_color_continuous(low='purple',high='green')
    p3<-p2+scale_y_discrete(labels=function(kegg) str_wrap(kegg, width=40)) #控制term长度
    p4<-p3+scale_size(range=c(2, 8));p4
    #p5<-p4 + aes(shape=GeneRatio > 0.1)
    
    q=dotplot(GO_bp,
            showCategory = 5,
            #color='pvalue',
            color='p.adjust',
            font.size=10,
            title='',
            by='geneRatio'
            )+
      theme(axis.text.x = element_text(angle=90))
    q2<-q + scale_color_continuous(low='purple',high='green')
    q3<-q2+scale_y_discrete(labels=function(kegg) str_wrap(kegg, width=50))
    q4<-q3+scale_size(range=c(2, 5));q4
    
    #拼图(好像效果不好?可以用AI拼)
    library("gridExtra")
    grid.arrange(q4, p4, ncol = 2)
    

    (四)GOplot包(需要FC数据)

    (五)clusterProfiler + ggplot2

    library(clusterProfiler)
    library(org.Hs.eg.db)
    library(rJava)
    library(xlsxjars)
    library(xlsx)
    
    Enrichment <- read.delim(file.choose(),sep = "\t",stringsAsFactors = F,header = T) # file=module+ID
    names(Enrichment)
    gene<-Enrichment[,2] 
    
    GO<-enrichGO(gene=gene,            #基因对应的向量
                 OrgDb = "org.Hs.eg.db",  #OrgDb指定该物种对应的org包的名字
                 keyType = "ENTREZID",    #基因ID的类型,默认为ENTREZID,也可用ENSEMBL
                 ont="ALL",               #GO类别,BP, CC, MF,ALL
                 qvalueCutoff = 0.05,
                 pAdjustMethod ="BH",
                 readable = T) 
    
    go<-as.data.frame(GO)
    View(go)
    table(go[,1]) #查看BP,CC,MF的统计数目
    
    #导出结果表格
    res = as.data.frame(GO @ result)
    write.xlsx(res,'31-10a_5p_target-GO.xlsx') #对应一般基因集合
    
    #输入数据整理
    go_MF<-go[go$ONTOLOGY=="MF",][1:10,]
    go_CC<-go[go$ONTOLOGY=="CC",][1:10,]
    go_BP<-go[go$ONTOLOGY=="BP",][1:10,]
    
    go_enrich_df<-data.frame(ID=c(go_BP$ID, go_CC$ID, go_MF$ID),
                             Description=c(go_BP$Description, go_CC$Description, go_MF$Description),
                             GeneNumber=c(go_BP$Count, go_CC$Count, go_MF$Count),
                             type=factor(c(rep("biological process", 10), 
                                           rep("cellular component", 10),
                                           rep("molecular function",10)),
                                         levels=c("molecular function", "cellular component", "biological process")))
    
    #GO条目可视化预处理
    ## numbers as data on x axis
    go_enrich_df$number <- factor(rev(1:nrow(go_enrich_df)))
    ## shorten the names of GO terms
    shorten_names <- function(x, n_word=4, n_char=40){
      if (length(strsplit(x, " ")[[1]]) > n_word || (nchar(x) > 40))
      {
        if (nchar(x) > 40) x <- substr(x, 1, 40)
        x <- paste(paste(strsplit(x, " ")[[1]][1:min(length(strsplit(x," ")[[1]]), n_word)],
                         collapse=" "), "...", sep="")
        return(x)
      } 
      else
      {
        return(x)
      }
    }
    
    labels=(sapply(
      levels(go_enrich_df$Description)[as.numeric(go_enrich_df$Description)],
      shorten_names))
    names(labels) = rev(1:nrow(go_enrich_df))
    
    #ggplot2绘图
    ## colors for bar // green, blue, orange
    CPCOLS <- c("#8DA1CB", "#FD8D62", "#66C3A5")
    library(ggplot2)
    p <- ggplot(data=go_enrich_df, aes(x=number, y=GeneNumber, fill=type)) +
      geom_bar(stat="identity", width=0.8) + coord_flip() + 
      scale_fill_manual(values = CPCOLS) + theme_test() + 
      scale_x_discrete(labels=labels) +
      xlab("GO term") + 
      theme(axis.text=element_text(face = "bold", color="gray50")) +
      labs(title = "The Most Enriched GO Terms")
    #coord_flip(...)横向转换坐标:把x轴和y轴互换,没有特殊参数
    p
    

    其他:

    #单集合图--------------------------------------------------------------------------------
    #对于基因和富集的pathways之间的对应关系进行展示,如果一个基因位于一个pathway下,则将该基因与pathway连线
    cnetplot(KEGG, 
             showCategory = 20,
             #colorEdge='',
             circular=F,
             node_label=T)
    
    #在pathway通路图上标记富集到的基因(会给出一个url链接)
    browseKEGG(KEGG, "hsa04080")
    
    barplot(ego_BP,showCategory = 6,title="The GO_BP enrichment analysis of all DEGs ")+ 
      scale_size(range=c(2, 12))+
      scale_x_discrete(labels=function(ego_BP) str_wrap(ego_BP,width = 25))
    

    二. GSEA

    library(topGO)
    library(enrichplot)
    library(ggplot2)
    library(org.Hs.eg.db) #人类基因组注释相关的包
    library(DO.db)
    library(clusterProfiler)
    #library(OrgDb) 注意,该包在R3.5中不兼容,把gseaGO里的此位置直接换成org.Hs.eg.db就行!!
    
    # 导入差异表达结果,筛选
    diff<-read.table(file = file.choose(), sep="\t", quote="", header= T, row.names = 1)
    head(diff)
    genelist<-diff[(which(diff$FDR < 0.05)),]
    head(genelist)
    
    # id转换
    genelist_id<-bitr(unique(row.names(genelist)),
                        fromType="ENSEMBL",toType="ENTREZID",
                        OrgDb="org.Hs.eg.db",drop = TRUE)#转换ID
    
    # 信息合并
    genelist<-as.data.frame(cbind(row.names(genelist),genelist$FDR))
    names(genelist)<-c("ENSEMBL","FDR")
    
    gene<-merge(genelist_id, genelist, all=F)#将entrezID、ensemblID和logFC合并
    gene<-gene[,-1]#去掉ensemblID只保留entrezID
    
    # 排序(gseKEGG的输入必须是排序后的geneList;需要两列:命名(每一个数字都有一个对应的名字,就是相应的基因ID了);排序(是一串数字,数字是从高到低排序的))
    geneList<-as.numeric(as.character(gene[,2]))
    names(geneList) = as.character(gene[,1])
    geneList= sort(geneList, decreasing = TRUE) #构建geneList,并根据logFC由高到低排列
    
    # 分别做下调基因和上调基因的GSEA
    #library(OrgDb) 注意,该包在R3.5中不兼容,把gseaGO里的此位置直接换成org.Hs.eg.db就行!!
    
    # 排序(gseKEGG的输入必须是排序后的geneList;需要两列:命名(每一个数字都有一个对应的名字,就是相应的基因ID了);表达量或FC列
    gene<-read.table(file = file.choose(), sep="\t", quote="", header= T, row.names = 1)
    #regulation<-gene[gene$log2FoldChange < 0,]
    geneList<-as.numeric(as.character(gene[,3])) 
    names(gene)
    names(geneList) = as.character(gene[,1]) #ID列
    geneList= sort(geneList, decreasing = TRUE)#geneList根据logFC由高到低排列
    head(geneList)
    
    # GSEA——KEGG分析
    GSEA_KEGG<-gseKEGG(
      geneList =geneList,
      nPerm = 1000,#
      keyType = 'kegg',#可以选择"kegg",'ncbi-geneid', 'ncib-proteinid' and 'uniprot'
      organism = 'hsa',#定义物种,
      pvalueCutoff = 0.1,#自定义pvalue的范围
      pAdjustMethod = "BH" #校正p值的方法
    )
    GSEA_KEGG$Description#富集到那些基因集上
    GSEA_KEGG$enrichmentScore#富集得分
    
    # 根据enrichmentScore对GSEA的结果进行排序
    sort_GSEA_KEGG<-GSEA_KEGG[order(GSEA_KEGG$enrichmentScore,decreasing=T)]
    
    # 画图
    gseaplot2(GSEA_KEGG,row.names(sort_GSEA_KEGG))
    
    # 美化
    gseaplot2(GSEA_KEGG,row.names(sort_GSEA_KEGG),#只显示前三个GSEA的结果
              title="GSEA_KEGG",#标题
              color = c("#626262","#8989FF","#FF0404"),#颜色
              pvalue_table = FALSE,
              ES_geom = "line" ) #enrichment scored的展现方式 'line' or 'dot'
    
    
    # GSEA——GO分析------------------------------------------------------------------
    GSEA_GO<-gseGO(geneList, ont = "BP", org.Hs.eg.db, keyType = "ENTREZID",
      exponent = 1, nPerm = 1000, minGSSize = 10, maxGSSize = 500,
      pvalueCutoff = 0.1, pAdjustMethod = "BH", verbose = TRUE,
      seed = FALSE, by = "fgsea")
    
    GSEA_GO$Description#富集到那些基因集上
    GSEA_GO$enrichmentScore#富集得分
    
    # 根据enrichmentScore对GSEA的结果进行排序
    sort_GSEA_GO<-GSEA_GO[order(GSEA_GO$enrichmentScore,decreasing=T)]
    
    # 画图
    gseaplot2(GSEA_GO,row.names(sort_GSEA_GO))
    
    # 美化
    gseaplot2(GSEA_GO,row.names(sort_GSEA_GO),#只显示前三个GSEA的结果
              title="GSEA_KEGG",#标题
              color = c("#626262","#8989FF","#FF0404"),#颜色
              pvalue_table = FALSE,
              ES_geom = "line" ) #enrichment scored的展现方式 'line' or 'dot'
    
    

    三、miRNA的富集分析

    1、DIANA TOOLS -mirPathv.3

    四、基本概念

    相关文章

      网友评论

          本文标题:富集分析与可视化:R 实现

          本文链接:https://www.haomeiwen.com/subject/ezfvqhtx.html