一. GO/KEGG
(一)富集分析
#source("https://bioconductor.org/biocLite.R")
#options(BioC_mirror="http://mirrors.ustc.edu.cn/bioc/")
#biocLite("DOSE") #画dotplot
#biocLite("clusterProfiler")
#biocLite("pathview") #通路画图
#biocLite("DO.db")
#biocLite("enrichplot")
#biocLite("Cairo")
#install.packages('xlsx')
#install.packages('rJava')
#install.packages('xlsxjars') 切换java10 但我忘了他是干嘛的了
##
library(DOSE)
library(GO.db)
library(org.Hs.eg.db)
library(topGO)
library(clusterProfiler)
library(pathview)
library(enrichplot)
library(cowplot)
library(ggplot2)
library(Cairo)#go,kegg
library(stringr)#kegg
library(rJava)
library(xlsxjars)
library(xlsx)
#这里是接WGCNA模块富集
#GO
Enrichment <- read.delim(file.choose(),sep = "\t",stringsAsFactors = F,header = T) # file=module+ID
head(Enrichment)
#选择需要富集的模块号
cc<-c('4','6','7','9','17','29')
pfc<-c('2','6','8','11',24)
#模块内循环
for (i in cc){
#i=4
Enrich_Source <- Enrichment[Enrichment$module== i,]
gene <- as.character(Enrich_Source[,4]) #用ID列
gene<-Enrichment[,2] #一般基因集合从这里跑,改ID所在列
ego_BP <- enrichGO(gene = gene, #注意用的是哪个集合
OrgDb=org.Hs.eg.db,
ont = "BP",#生物过程(biological process)
pAdjustMethod = "BH",
minGSSize = 1,
pvalueCutoff = 0.05, #默认0.01
qvalueCutoff = 0.2, #默认0.05,还见过设0.1的
readable = TRUE
)
BP = as.data.frame(ego_BP @ result)
write.xlsx(BP, paste('CC_1.5_10_0.15_GO_BP_Module_', i, '.xlsx', sep=""),row.names = F)
write.xlsx(BP,'30-2233p-target_GO_BP.xlsx') #对应一般基因集合
KEGG <- enrichKEGG(gene = gene,
organism = "hsa", #R 3.5的版本就要这样写,而且必须联网
minGSSize = 1,
pvalueCutoff = 0.05, #默认0.01
pAdjustMethod = "BH",
qvalueCutoff = 0.2, #默认0.05,还见过设0.1的
use_internal_data = F
#readable = T
)
KEGG_Pathway <- as.data.frame(KEGG @ result)
write.xlsx(KEGG_Pathway,paste('CC_CC_1.5_10_0.15_KEGG_Pathway_Module_',i,'.xlsx',sep = ""),row.names = F)
write.xlsx(KEGG_Pathway,'30-2233p-target_KEGG_Pathway.xlsx') #注意选择保存对象
}
#一般条图和气泡图就够用
(二)GOChord、弦图什么的
- 脱离clusterprofiler包,需要从以上富集结果整理输入文件
#BiocManager::install("GOplot",ask = F,update = F)
library(GOplot)
library(readr)
library(stringr)
#https://www.jianshu.com/p/a50310f9418f
#https://www.jianshu.com/p/48ac98098760
#https://www.jianshu.com/p/9bda0ab65717
#http://www.360doc.com/content/19/0416/12/52645714_829160785.shtml
#输入文件准备
#GO: ID term genes pvalue category (可data.fram(ego))
#genes: geneID log2FC
setwd('C:/Users/Documents/WORK/results')
GO<-read_csv("27-GOChord-GO.csv", col_names = T);GO=data.frame(GO)
gene<-read_csv("27-GOChord-gene.csv", col_names = T);gene=data.frame(gene)
GO$genes=str_replace_all(GO$genes, '/', ',')
names(GO)=c('ID','term','genes','adj_pval','Category')
names(gene)=c('ID','logFC')
plot.data<-list(DA=GO, GE=gene)
circ=data.frame();circ<-circle_dat(plot.data$DA, plot.data$GE)#创建绘图对象
circ;names(circ)#"category" "ID" "term" "count" "genes" "logFC" "adj_pval" "zscore"
process=unique(circ$term)
#?????
chord<-chord_dat(circ) #data,gene,process,error!
head(chord)
GOChord(chord)
GOChord(chord, title="GOChord plot",#标题设置
space = 0.03, #GO term处间隔大小设置
limit = c(3, 5),
#第一个数值为至少分配给一个基因的Goterm数,
#第二个数值为至少分配给一个GOterm的基因数
gene.order = 'logFC', gene.space = 0.25, gene.size = 5,#基因排序,间隔,名字大小设置
#lfc.col=c('firebrick3', 'white','royalblue3'),##上调下调颜色设置
#ribbon.col=colorRampPalette(c("blue", "red"))(length(EC$process)),#GO term 颜色设置
ribbon.col=brewer.pal(length(EC$process), "Set3"))
(三)进阶富集分析比较与可视化
library(clusterProfiler)
library(org.Hs.eg.db)
library(dplyr)
#多集合比较
#https://blog.csdn.net/qazplm12_3/article/details/76474668
#http://www.360doc.com/content/17/0728/08/19913717_674694421.shtml(修饰)
#https://www.cnblogs.com/wangshicheng/articles/10122804.html(长文本截断,气泡大小)
#data1
data<-read.table(file = file.choose(),header = T,sep = '\t',stringsAsFactors = F)
head(data)
#模块号+ID
cc_m4<-data[data$module=='4',3]
cc_m6<-data[data$module=='6',3]
cc_m7<-data[data$module=='7',3]
cc_m9<-data[data$module=='9',3]
cc_m17<-data[data$module=='17',3]
cc_m29<-data[data$module=='29',3]
CC<-list(M4=cc_m4,M6=cc_m6,M17=cc_m17,M29=cc_m29)
#多组富集
kegg_cc<-compareCluster(CC, 'enrichKEGG', organism = "hsa",pvalueCutoff = 0.05)
dotplot(kegg_cc,showCategory = 30,color='pvalue') #可选呈现多少条目
GO_bp_cc<-compareCluster(CC, 'enrichGO', ont = "BP", OrgDb=org.Hs.eg.db)
dotplot(GO_bp_cc,showCategory = 12)
#data2
data2<-read.table(file = file.choose(),header = T,sep = '\t',stringsAsFactors = F)
head(data2)
pfc_m2<-data2[data2$module=='2',3]
pfc_m5<-data2[data2$module=='5',3]
pfc_m6<-data2[data2$module=='6',3]
pfc_m8<-data2[data2$module=='8',3]
pfc_m11<-data2[data2$module=='11',3]
pfc_m24<-data2[data2$module=='24',3]
PFC<-list(M2=pfc_m2,M6=pfc_m6,M8=pfc_m8)
kegg_pfc<-compareCluster(PFC, 'enrichKEGG', organism = "hsa",pvalueCutoff = 0.05)
dotplot(kegg_pfc,showCategory = 30,color='pvalue')
GO_bp_pfc<-compareCluster(PFC, 'enrichGO', ont = "BP", OrgDb=org.Hs.eg.db)
dotplot(GO_bp_pfc,showCategory = 12)
# 保存结果
library(xlsx)
setwd()
res_kegg<-kegg@compareClusterResult
write.table(res_kegg, file="CC_CompareEnrichment_KEGG.xlsl", row.names=F)
res_GO_bp<-GO_bp@compareClusterResult
write.table(res_GO_bp, file="CompareEnrichment_GOBP.xlsx", row.names=F)
#结果筛选
sub = res %>% group_by(Cluster) %>% top_n(-10, pvalue)
sub10 = res[res$Description %in% sub$Description,]
# 多模块间比较-----------------------------------------------------------------
A<-list(CC_M4=cc_m4,
CC_M6=cc_m6,
#CC_M7=cc_m7,
#CC_M9=cc_m9,
CC_M17=cc_m17,
CC_M29=cc_m29,
PFC_M2=pfc_m2,
#PFC_M5=pfc_m5,
PFC_M6=pfc_m6,
PFC_M8=pfc_m8)
#PFC_M11=pfc_m11,
#PFC_M24=pfc_m24)
kegg<-compareCluster(A, 'enrichKEGG', organism = "hsa",pvalueCutoff = 0.05)
GO_bp<-compareCluster(A, 'enrichGO', ont = "BP", OrgDb=org.Hs.eg.db,pvalueCutoff = 0.05)
GO_cc<-compareCluster(A, 'enrichGO', ont = "CC", OrgDb=org.Hs.eg.db,pvalueCutoff = 0.05)
GO_mf<-compareCluster(A, 'enrichGO', ont = "MF", OrgDb=org.Hs.eg.db,pvalueCutoff = 0.05)
#可视化-------------------------------------------------------------------------------------
library(ggplot2)
library(stringr)
p=dotplot(kegg,
showCategory = 6, #自定义
includeAll=T,
#color='p.adjust',
font.size=10,
title='xxxxxx', #一般不设置
by='geneRatio'
#by='count'
)+
theme(axis.text.x = element_text(angle=90))
p2<-p + scale_color_continuous(low='purple',high='green')
p3<-p2+scale_y_discrete(labels=function(kegg) str_wrap(kegg, width=40)) #控制term长度
p4<-p3+scale_size(range=c(2, 8));p4
#p5<-p4 + aes(shape=GeneRatio > 0.1)
q=dotplot(GO_bp,
showCategory = 5,
#color='pvalue',
color='p.adjust',
font.size=10,
title='',
by='geneRatio'
)+
theme(axis.text.x = element_text(angle=90))
q2<-q + scale_color_continuous(low='purple',high='green')
q3<-q2+scale_y_discrete(labels=function(kegg) str_wrap(kegg, width=50))
q4<-q3+scale_size(range=c(2, 5));q4
#拼图(好像效果不好?可以用AI拼)
library("gridExtra")
grid.arrange(q4, p4, ncol = 2)
(四)GOplot包(需要FC数据)
(五)clusterProfiler + ggplot2
library(clusterProfiler)
library(org.Hs.eg.db)
library(rJava)
library(xlsxjars)
library(xlsx)
Enrichment <- read.delim(file.choose(),sep = "\t",stringsAsFactors = F,header = T) # file=module+ID
names(Enrichment)
gene<-Enrichment[,2]
GO<-enrichGO(gene=gene, #基因对应的向量
OrgDb = "org.Hs.eg.db", #OrgDb指定该物种对应的org包的名字
keyType = "ENTREZID", #基因ID的类型,默认为ENTREZID,也可用ENSEMBL
ont="ALL", #GO类别,BP, CC, MF,ALL
qvalueCutoff = 0.05,
pAdjustMethod ="BH",
readable = T)
go<-as.data.frame(GO)
View(go)
table(go[,1]) #查看BP,CC,MF的统计数目
#导出结果表格
res = as.data.frame(GO @ result)
write.xlsx(res,'31-10a_5p_target-GO.xlsx') #对应一般基因集合
#输入数据整理
go_MF<-go[go$ONTOLOGY=="MF",][1:10,]
go_CC<-go[go$ONTOLOGY=="CC",][1:10,]
go_BP<-go[go$ONTOLOGY=="BP",][1:10,]
go_enrich_df<-data.frame(ID=c(go_BP$ID, go_CC$ID, go_MF$ID),
Description=c(go_BP$Description, go_CC$Description, go_MF$Description),
GeneNumber=c(go_BP$Count, go_CC$Count, go_MF$Count),
type=factor(c(rep("biological process", 10),
rep("cellular component", 10),
rep("molecular function",10)),
levels=c("molecular function", "cellular component", "biological process")))
#GO条目可视化预处理
## numbers as data on x axis
go_enrich_df$number <- factor(rev(1:nrow(go_enrich_df)))
## shorten the names of GO terms
shorten_names <- function(x, n_word=4, n_char=40){
if (length(strsplit(x, " ")[[1]]) > n_word || (nchar(x) > 40))
{
if (nchar(x) > 40) x <- substr(x, 1, 40)
x <- paste(paste(strsplit(x, " ")[[1]][1:min(length(strsplit(x," ")[[1]]), n_word)],
collapse=" "), "...", sep="")
return(x)
}
else
{
return(x)
}
}
labels=(sapply(
levels(go_enrich_df$Description)[as.numeric(go_enrich_df$Description)],
shorten_names))
names(labels) = rev(1:nrow(go_enrich_df))
#ggplot2绘图
## colors for bar // green, blue, orange
CPCOLS <- c("#8DA1CB", "#FD8D62", "#66C3A5")
library(ggplot2)
p <- ggplot(data=go_enrich_df, aes(x=number, y=GeneNumber, fill=type)) +
geom_bar(stat="identity", width=0.8) + coord_flip() +
scale_fill_manual(values = CPCOLS) + theme_test() +
scale_x_discrete(labels=labels) +
xlab("GO term") +
theme(axis.text=element_text(face = "bold", color="gray50")) +
labs(title = "The Most Enriched GO Terms")
#coord_flip(...)横向转换坐标:把x轴和y轴互换,没有特殊参数
p
其他:
#单集合图--------------------------------------------------------------------------------
#对于基因和富集的pathways之间的对应关系进行展示,如果一个基因位于一个pathway下,则将该基因与pathway连线
cnetplot(KEGG,
showCategory = 20,
#colorEdge='',
circular=F,
node_label=T)
#在pathway通路图上标记富集到的基因(会给出一个url链接)
browseKEGG(KEGG, "hsa04080")
barplot(ego_BP,showCategory = 6,title="The GO_BP enrichment analysis of all DEGs ")+
scale_size(range=c(2, 12))+
scale_x_discrete(labels=function(ego_BP) str_wrap(ego_BP,width = 25))
二. GSEA
library(topGO)
library(enrichplot)
library(ggplot2)
library(org.Hs.eg.db) #人类基因组注释相关的包
library(DO.db)
library(clusterProfiler)
#library(OrgDb) 注意,该包在R3.5中不兼容,把gseaGO里的此位置直接换成org.Hs.eg.db就行!!
# 导入差异表达结果,筛选
diff<-read.table(file = file.choose(), sep="\t", quote="", header= T, row.names = 1)
head(diff)
genelist<-diff[(which(diff$FDR < 0.05)),]
head(genelist)
# id转换
genelist_id<-bitr(unique(row.names(genelist)),
fromType="ENSEMBL",toType="ENTREZID",
OrgDb="org.Hs.eg.db",drop = TRUE)#转换ID
# 信息合并
genelist<-as.data.frame(cbind(row.names(genelist),genelist$FDR))
names(genelist)<-c("ENSEMBL","FDR")
gene<-merge(genelist_id, genelist, all=F)#将entrezID、ensemblID和logFC合并
gene<-gene[,-1]#去掉ensemblID只保留entrezID
# 排序(gseKEGG的输入必须是排序后的geneList;需要两列:命名(每一个数字都有一个对应的名字,就是相应的基因ID了);排序(是一串数字,数字是从高到低排序的))
geneList<-as.numeric(as.character(gene[,2]))
names(geneList) = as.character(gene[,1])
geneList= sort(geneList, decreasing = TRUE) #构建geneList,并根据logFC由高到低排列
# 分别做下调基因和上调基因的GSEA
#library(OrgDb) 注意,该包在R3.5中不兼容,把gseaGO里的此位置直接换成org.Hs.eg.db就行!!
# 排序(gseKEGG的输入必须是排序后的geneList;需要两列:命名(每一个数字都有一个对应的名字,就是相应的基因ID了);表达量或FC列
gene<-read.table(file = file.choose(), sep="\t", quote="", header= T, row.names = 1)
#regulation<-gene[gene$log2FoldChange < 0,]
geneList<-as.numeric(as.character(gene[,3]))
names(gene)
names(geneList) = as.character(gene[,1]) #ID列
geneList= sort(geneList, decreasing = TRUE)#geneList根据logFC由高到低排列
head(geneList)
# GSEA——KEGG分析
GSEA_KEGG<-gseKEGG(
geneList =geneList,
nPerm = 1000,#
keyType = 'kegg',#可以选择"kegg",'ncbi-geneid', 'ncib-proteinid' and 'uniprot'
organism = 'hsa',#定义物种,
pvalueCutoff = 0.1,#自定义pvalue的范围
pAdjustMethod = "BH" #校正p值的方法
)
GSEA_KEGG$Description#富集到那些基因集上
GSEA_KEGG$enrichmentScore#富集得分
# 根据enrichmentScore对GSEA的结果进行排序
sort_GSEA_KEGG<-GSEA_KEGG[order(GSEA_KEGG$enrichmentScore,decreasing=T)]
# 画图
gseaplot2(GSEA_KEGG,row.names(sort_GSEA_KEGG))
# 美化
gseaplot2(GSEA_KEGG,row.names(sort_GSEA_KEGG),#只显示前三个GSEA的结果
title="GSEA_KEGG",#标题
color = c("#626262","#8989FF","#FF0404"),#颜色
pvalue_table = FALSE,
ES_geom = "line" ) #enrichment scored的展现方式 'line' or 'dot'
# GSEA——GO分析------------------------------------------------------------------
GSEA_GO<-gseGO(geneList, ont = "BP", org.Hs.eg.db, keyType = "ENTREZID",
exponent = 1, nPerm = 1000, minGSSize = 10, maxGSSize = 500,
pvalueCutoff = 0.1, pAdjustMethod = "BH", verbose = TRUE,
seed = FALSE, by = "fgsea")
GSEA_GO$Description#富集到那些基因集上
GSEA_GO$enrichmentScore#富集得分
# 根据enrichmentScore对GSEA的结果进行排序
sort_GSEA_GO<-GSEA_GO[order(GSEA_GO$enrichmentScore,decreasing=T)]
# 画图
gseaplot2(GSEA_GO,row.names(sort_GSEA_GO))
# 美化
gseaplot2(GSEA_GO,row.names(sort_GSEA_GO),#只显示前三个GSEA的结果
title="GSEA_KEGG",#标题
color = c("#626262","#8989FF","#FF0404"),#颜色
pvalue_table = FALSE,
ES_geom = "line" ) #enrichment scored的展现方式 'line' or 'dot'
三、miRNA的富集分析
1、DIANA TOOLS -mirPathv.3
四、基本概念
网友评论