富集分析与可视化：R 实现

作者: 挽山 | 来源:发表于2020-02-27 00:07 被阅读0次

一. GO/KEGG

（一）富集分析

#source("https://bioconductor.org/biocLite.R")
#options(BioC_mirror="http://mirrors.ustc.edu.cn/bioc/")
#biocLite("DOSE") #画dotplot
#biocLite("clusterProfiler")
#biocLite("pathview") #通路画图
#biocLite("DO.db")
#biocLite("enrichplot")
#biocLite("Cairo")
#install.packages('xlsx')
#install.packages('rJava')
#install.packages('xlsxjars') 切换java10 但我忘了他是干嘛的了
## 
library(DOSE)
library(GO.db)
library(org.Hs.eg.db)
library(topGO)
library(clusterProfiler)
library(pathview)
library(enrichplot)
library(cowplot)
library(ggplot2)
library(Cairo)#go,kegg
library(stringr)#kegg

library(rJava)
library(xlsxjars)
library(xlsx)

#这里是接WGCNA模块富集
#GO
Enrichment <- read.delim(file.choose(),sep = "\t",stringsAsFactors = F,header = T) # file=module+ID
head(Enrichment)

#选择需要富集的模块号
cc<-c('4','6','7','9','17','29')
pfc<-c('2','6','8','11',24)

#模块内循环
for (i in cc){
  #i=4
  Enrich_Source <- Enrichment[Enrichment$module== i,]
  gene <- as.character(Enrich_Source[,4]) #用ID列
  gene<-Enrichment[,2] #一般基因集合从这里跑，改ID所在列
  
  ego_BP <- enrichGO(gene = gene, #注意用的是哪个集合
                     OrgDb=org.Hs.eg.db,
                     ont = "BP",#生物过程（biological process）
                     pAdjustMethod = "BH",
                     minGSSize = 1,
                     pvalueCutoff = 0.05, #默认0.01
                     qvalueCutoff = 0.2,  #默认0.05，还见过设0.1的
                     readable = TRUE
                     )
  BP = as.data.frame(ego_BP @ result)
  write.xlsx(BP, paste('CC_1.5_10_0.15_GO_BP_Module_', i, '.xlsx', sep=""),row.names = F)
  write.xlsx(BP,'30-2233p-target_GO_BP.xlsx') #对应一般基因集合
  
 KEGG <- enrichKEGG(gene = gene, 
                     organism = "hsa", #R 3.5的版本就要这样写,而且必须联网
                     minGSSize = 1,
                     pvalueCutoff = 0.05,      #默认0.01
                     pAdjustMethod = "BH",
                     qvalueCutoff = 0.2,       #默认0.05，还见过设0.1的
                     use_internal_data = F
                     #readable = T
                    )
  KEGG_Pathway <- as.data.frame(KEGG @ result)
 write.xlsx(KEGG_Pathway,paste('CC_CC_1.5_10_0.15_KEGG_Pathway_Module_',i,'.xlsx',sep = ""),row.names = F)
 write.xlsx(KEGG_Pathway,'30-2233p-target_KEGG_Pathway.xlsx') #注意选择保存对象
 }

#一般条图和气泡图就够用

（二）GOChord、弦图什么的

脱离clusterprofiler包，需要从以上富集结果整理输入文件

#BiocManager::install("GOplot",ask = F,update = F)
library(GOplot)
library(readr)
library(stringr)
#https://www.jianshu.com/p/a50310f9418f
#https://www.jianshu.com/p/48ac98098760
#https://www.jianshu.com/p/9bda0ab65717
#http://www.360doc.com/content/19/0416/12/52645714_829160785.shtml
#输入文件准备
#GO: ID term    genes   pvalue  category (可data.fram(ego))
#genes: geneID  log2FC
setwd('C:/Users/Documents/WORK/results')
GO<-read_csv("27-GOChord-GO.csv", col_names = T);GO=data.frame(GO) 
gene<-read_csv("27-GOChord-gene.csv", col_names = T);gene=data.frame(gene)
GO$genes=str_replace_all(GO$genes, '/', ',')
names(GO)=c('ID','term','genes','adj_pval','Category')
names(gene)=c('ID','logFC')

plot.data<-list(DA=GO, GE=gene)
circ=data.frame();circ<-circle_dat(plot.data$DA, plot.data$GE)#创建绘图对象
circ;names(circ)#"category" "ID"  "term"  "count"  "genes"  "logFC"  "adj_pval" "zscore"  
process=unique(circ$term)
#?????
chord<-chord_dat(circ) #data,gene,process,error!
head(chord)
GOChord(chord)

GOChord(chord, title="GOChord plot",#标题设置
        space = 0.03, #GO term处间隔大小设置
        limit = c(3, 5),
        #第一个数值为至少分配给一个基因的Goterm数，
        #第二个数值为至少分配给一个GOterm的基因数
        gene.order = 'logFC', gene.space = 0.25, gene.size = 5,#基因排序，间隔，名字大小设置
        #lfc.col=c('firebrick3', 'white','royalblue3'),##上调下调颜色设置
        #ribbon.col=colorRampPalette(c("blue", "red"))(length(EC$process)),#GO term 颜色设置
        ribbon.col=brewer.pal(length(EC$process), "Set3"))

（三）进阶富集分析比较与可视化

library(clusterProfiler)
library(org.Hs.eg.db)
library(dplyr)

#多集合比较
#https://blog.csdn.net/qazplm12_3/article/details/76474668
#http://www.360doc.com/content/17/0728/08/19913717_674694421.shtml(修饰)
#https://www.cnblogs.com/wangshicheng/articles/10122804.html(长文本截断，气泡大小)

#data1
data<-read.table(file = file.choose(),header = T,sep = '\t',stringsAsFactors = F)
head(data)
#模块号+ID
cc_m4<-data[data$module=='4',3]
cc_m6<-data[data$module=='6',3]
cc_m7<-data[data$module=='7',3]
cc_m9<-data[data$module=='9',3]
cc_m17<-data[data$module=='17',3]
cc_m29<-data[data$module=='29',3]

CC<-list(M4=cc_m4,M6=cc_m6,M17=cc_m17,M29=cc_m29)

#多组富集
kegg_cc<-compareCluster(CC, 'enrichKEGG', organism = "hsa",pvalueCutoff = 0.05)
dotplot(kegg_cc,showCategory = 30,color='pvalue') #可选呈现多少条目

GO_bp_cc<-compareCluster(CC, 'enrichGO', ont = "BP", OrgDb=org.Hs.eg.db)
dotplot(GO_bp_cc,showCategory = 12)

#data2
data2<-read.table(file = file.choose(),header = T,sep = '\t',stringsAsFactors = F)
head(data2)

pfc_m2<-data2[data2$module=='2',3]
pfc_m5<-data2[data2$module=='5',3]
pfc_m6<-data2[data2$module=='6',3]
pfc_m8<-data2[data2$module=='8',3]
pfc_m11<-data2[data2$module=='11',3]
pfc_m24<-data2[data2$module=='24',3]

PFC<-list(M2=pfc_m2,M6=pfc_m6,M8=pfc_m8)

kegg_pfc<-compareCluster(PFC, 'enrichKEGG', organism = "hsa",pvalueCutoff = 0.05)
dotplot(kegg_pfc,showCategory = 30,color='pvalue')

GO_bp_pfc<-compareCluster(PFC, 'enrichGO', ont = "BP", OrgDb=org.Hs.eg.db)
dotplot(GO_bp_pfc,showCategory = 12)


# 保存结果 
library(xlsx)
setwd()
res_kegg<-kegg@compareClusterResult
write.table(res_kegg, file="CC_CompareEnrichment_KEGG.xlsl", row.names=F)
res_GO_bp<-GO_bp@compareClusterResult
write.table(res_GO_bp, file="CompareEnrichment_GOBP.xlsx", row.names=F)

#结果筛选
sub = res %>% group_by(Cluster) %>% top_n(-10, pvalue)
sub10 = res[res$Description %in% sub$Description,]

# 多模块间比较-----------------------------------------------------------------
A<-list(CC_M4=cc_m4,
         CC_M6=cc_m6,
         #CC_M7=cc_m7,
         #CC_M9=cc_m9,
         CC_M17=cc_m17,
         CC_M29=cc_m29,
         
         PFC_M2=pfc_m2,
         #PFC_M5=pfc_m5, 
         PFC_M6=pfc_m6,
         PFC_M8=pfc_m8)
         #PFC_M11=pfc_m11,
         #PFC_M24=pfc_m24)

kegg<-compareCluster(A, 'enrichKEGG', organism = "hsa",pvalueCutoff = 0.05)

GO_bp<-compareCluster(A, 'enrichGO', ont = "BP", OrgDb=org.Hs.eg.db,pvalueCutoff = 0.05)
GO_cc<-compareCluster(A, 'enrichGO', ont = "CC", OrgDb=org.Hs.eg.db,pvalueCutoff = 0.05)
GO_mf<-compareCluster(A, 'enrichGO', ont = "MF", OrgDb=org.Hs.eg.db,pvalueCutoff = 0.05)

#可视化-------------------------------------------------------------------------------------
library(ggplot2)
library(stringr)
p=dotplot(kegg,
        showCategory = 6, #自定义
        includeAll=T,
        #color='p.adjust',
        font.size=10,
        title='xxxxxx', #一般不设置
        by='geneRatio'
        #by='count'
        )+
  theme(axis.text.x = element_text(angle=90))
p2<-p + scale_color_continuous(low='purple',high='green')
p3<-p2+scale_y_discrete(labels=function(kegg) str_wrap(kegg, width=40)) #控制term长度
p4<-p3+scale_size(range=c(2, 8));p4
#p5<-p4 + aes(shape=GeneRatio > 0.1)

q=dotplot(GO_bp,
        showCategory = 5,
        #color='pvalue',
        color='p.adjust',
        font.size=10,
        title='',
        by='geneRatio'
        )+
  theme(axis.text.x = element_text(angle=90))
q2<-q + scale_color_continuous(low='purple',high='green')
q3<-q2+scale_y_discrete(labels=function(kegg) str_wrap(kegg, width=50))
q4<-q3+scale_size(range=c(2, 5));q4

#拼图（好像效果不好？可以用AI拼）
library("gridExtra")
grid.arrange(q4, p4, ncol = 2)

（四）GOplot包（需要FC数据）

参考：
https://www.bioinfo-scrounger.com/archives/799/ （全：包括聚类、环图等）
https://www.jianshu.com/p/8bf3cb8e1e1c （水平条图）

（五）clusterProfiler + ggplot2

参考：
https://www.cnblogs.com/yanjiamin/p/12149542.html （KEGG气泡图）
https://www.cnblogs.com/yanjiamin/p/12122215.html（GO-BP、MF、CC条图）
特别操作：缩减term长度

library(clusterProfiler)
library(org.Hs.eg.db)
library(rJava)
library(xlsxjars)
library(xlsx)

Enrichment <- read.delim(file.choose(),sep = "\t",stringsAsFactors = F,header = T) # file=module+ID
names(Enrichment)
gene<-Enrichment[,2] 

GO<-enrichGO(gene=gene,            #基因对应的向量
             OrgDb = "org.Hs.eg.db",  #OrgDb指定该物种对应的org包的名字
             keyType = "ENTREZID",    #基因ID的类型，默认为ENTREZID，也可用ENSEMBL
             ont="ALL",               #GO类别，BP, CC, MF，ALL
             qvalueCutoff = 0.05,
             pAdjustMethod ="BH",
             readable = T) 

go<-as.data.frame(GO)
View(go)
table(go[,1]) #查看BP,CC,MF的统计数目

#导出结果表格
res = as.data.frame(GO @ result)
write.xlsx(res,'31-10a_5p_target-GO.xlsx') #对应一般基因集合

#输入数据整理
go_MF<-go[go$ONTOLOGY=="MF",][1:10,]
go_CC<-go[go$ONTOLOGY=="CC",][1:10,]
go_BP<-go[go$ONTOLOGY=="BP",][1:10,]

go_enrich_df<-data.frame(ID=c(go_BP$ID, go_CC$ID, go_MF$ID),
                         Description=c(go_BP$Description, go_CC$Description, go_MF$Description),
                         GeneNumber=c(go_BP$Count, go_CC$Count, go_MF$Count),
                         type=factor(c(rep("biological process", 10), 
                                       rep("cellular component", 10),
                                       rep("molecular function",10)),
                                     levels=c("molecular function", "cellular component", "biological process")))

#GO条目可视化预处理
## numbers as data on x axis
go_enrich_df$number <- factor(rev(1:nrow(go_enrich_df)))
## shorten the names of GO terms
shorten_names <- function(x, n_word=4, n_char=40){
  if (length(strsplit(x, " ")[[1]]) > n_word || (nchar(x) > 40))
  {
    if (nchar(x) > 40) x <- substr(x, 1, 40)
    x <- paste(paste(strsplit(x, " ")[[1]][1:min(length(strsplit(x," ")[[1]]), n_word)],
                     collapse=" "), "...", sep="")
    return(x)
  } 
  else
  {
    return(x)
  }
}

labels=(sapply(
  levels(go_enrich_df$Description)[as.numeric(go_enrich_df$Description)],
  shorten_names))
names(labels) = rev(1:nrow(go_enrich_df))

#ggplot2绘图
## colors for bar // green, blue, orange
CPCOLS <- c("#8DA1CB", "#FD8D62", "#66C3A5")
library(ggplot2)
p <- ggplot(data=go_enrich_df, aes(x=number, y=GeneNumber, fill=type)) +
  geom_bar(stat="identity", width=0.8) + coord_flip() + 
  scale_fill_manual(values = CPCOLS) + theme_test() + 
  scale_x_discrete(labels=labels) +
  xlab("GO term") + 
  theme(axis.text=element_text(face = "bold", color="gray50")) +
  labs(title = "The Most Enriched GO Terms")
#coord_flip(...)横向转换坐标：把x轴和y轴互换，没有特殊参数
p

其他：

#单集合图--------------------------------------------------------------------------------
#对于基因和富集的pathways之间的对应关系进行展示，如果一个基因位于一个pathway下，则将该基因与pathway连线
cnetplot(KEGG, 
         showCategory = 20,
         #colorEdge='',
         circular=F,
         node_label=T)

#在pathway通路图上标记富集到的基因（会给出一个url链接）
browseKEGG(KEGG, "hsa04080")

barplot(ego_BP,showCategory = 6,title="The GO_BP enrichment analysis of all DEGs ")+ 
  scale_size(range=c(2, 12))+
  scale_x_discrete(labels=function(ego_BP) str_wrap(ego_BP,width = 25))

二. GSEA

library(topGO)
library(enrichplot)
library(ggplot2)
library(org.Hs.eg.db) #人类基因组注释相关的包
library(DO.db)
library(clusterProfiler)
#library(OrgDb) 注意，该包在R3.5中不兼容，把gseaGO里的此位置直接换成org.Hs.eg.db就行！！

# 导入差异表达结果，筛选
diff<-read.table(file = file.choose(), sep="\t", quote="", header= T, row.names = 1)
head(diff)
genelist<-diff[(which(diff$FDR < 0.05)),]
head(genelist)

# id转换
genelist_id<-bitr(unique(row.names(genelist)),
                    fromType="ENSEMBL",toType="ENTREZID",
                    OrgDb="org.Hs.eg.db",drop = TRUE)#转换ID

# 信息合并
genelist<-as.data.frame(cbind(row.names(genelist),genelist$FDR))
names(genelist)<-c("ENSEMBL","FDR")

gene<-merge(genelist_id, genelist, all=F)#将entrezID、ensemblID和logFC合并
gene<-gene[,-1]#去掉ensemblID只保留entrezID

# 排序（gseKEGG的输入必须是排序后的geneList；需要两列：命名(每一个数字都有一个对应的名字，就是相应的基因ID了)；排序(是一串数字，数字是从高到低排序的)）
geneList<-as.numeric(as.character(gene[,2]))
names(geneList) = as.character(gene[,1])
geneList= sort(geneList, decreasing = TRUE) #构建geneList,并根据logFC由高到低排列

# 分别做下调基因和上调基因的GSEA
#library(OrgDb) 注意，该包在R3.5中不兼容，把gseaGO里的此位置直接换成org.Hs.eg.db就行！！

# 排序（gseKEGG的输入必须是排序后的geneList；需要两列：命名(每一个数字都有一个对应的名字，就是相应的基因ID了)；表达量或FC列
gene<-read.table(file = file.choose(), sep="\t", quote="", header= T, row.names = 1)
#regulation<-gene[gene$log2FoldChange < 0,]
geneList<-as.numeric(as.character(gene[,3])) 
names(gene)
names(geneList) = as.character(gene[,1]) #ID列
geneList= sort(geneList, decreasing = TRUE)#geneList根据logFC由高到低排列
head(geneList)

# GSEA——KEGG分析
GSEA_KEGG<-gseKEGG(
  geneList =geneList,
  nPerm = 1000,#
  keyType = 'kegg',#可以选择"kegg",'ncbi-geneid', 'ncib-proteinid' and 'uniprot'
  organism = 'hsa',#定义物种,
  pvalueCutoff = 0.1,#自定义pvalue的范围
  pAdjustMethod = "BH" #校正p值的方法
)
GSEA_KEGG$Description#富集到那些基因集上
GSEA_KEGG$enrichmentScore#富集得分

# 根据enrichmentScore对GSEA的结果进行排序
sort_GSEA_KEGG<-GSEA_KEGG[order(GSEA_KEGG$enrichmentScore,decreasing=T)]

# 画图
gseaplot2(GSEA_KEGG,row.names(sort_GSEA_KEGG))

# 美化
gseaplot2(GSEA_KEGG,row.names(sort_GSEA_KEGG),#只显示前三个GSEA的结果
          title="GSEA_KEGG",#标题
          color = c("#626262","#8989FF","#FF0404"),#颜色
          pvalue_table = FALSE,
          ES_geom = "line" ) #enrichment scored的展现方式 'line' or 'dot'


# GSEA——GO分析------------------------------------------------------------------
GSEA_GO<-gseGO(geneList, ont = "BP", org.Hs.eg.db, keyType = "ENTREZID",
  exponent = 1, nPerm = 1000, minGSSize = 10, maxGSSize = 500,
  pvalueCutoff = 0.1, pAdjustMethod = "BH", verbose = TRUE,
  seed = FALSE, by = "fgsea")

GSEA_GO$Description#富集到那些基因集上
GSEA_GO$enrichmentScore#富集得分

# 根据enrichmentScore对GSEA的结果进行排序
sort_GSEA_GO<-GSEA_GO[order(GSEA_GO$enrichmentScore,decreasing=T)]

# 画图
gseaplot2(GSEA_GO,row.names(sort_GSEA_GO))

# 美化
gseaplot2(GSEA_GO,row.names(sort_GSEA_GO),#只显示前三个GSEA的结果
          title="GSEA_KEGG",#标题
          color = c("#626262","#8989FF","#FF0404"),#颜色
          pvalue_table = FALSE,
          ES_geom = "line" ) #enrichment scored的展现方式 'line' or 'dot'

三、miRNA的富集分析

1、DIANA TOOLS -mirPathv.3

参考：miRPath:miRNA相关GO和KEGG功能分析 - 简书

四、基本概念

参考：https://www.jianshu.com/p/988d90484f77

富集分析与可视化：R 实现

一. GO/KEGG

（一）富集分析

（二）GOChord、弦图什么的

（三）进阶富集分析比较与可视化

（四）GOplot包（需要FC数据）

（五）clusterProfiler + ggplot2

其他：

二. GSEA

三、miRNA的富集分析

四、基本概念

相关文章

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读