前提
代谢物富集分析的目的是为了解析某些差异代谢物是否落在某些pathway上(可简单理解为单个差异代谢物解释pathway较弱,同一pathway的代谢物共同解释该通路变化则证据较为robust,在很多生物领域均存在类似的处理逻辑),进而影响pathway的功能。基于pathway可分析代谢物的上下游基因或酶等影响,最终存在阐明作用机制的可能性。
代谢组服务公司反馈回来的代谢组表一般是基于intensity也即是质谱峰强度的数据,需要做前处理方可使用。本次使用直接处理后的数据用于分析。
PS:代谢组ID也有类似基因ID的多平台多命名问题,可以参考类似的名称转换工具包。本次直接使用cpdid,可直接用于KEGG COMPOUND数据库。
输入数据
因为本次研究是内部暂未公开的数据,所以不能提供给大家使用,但可以细说一下有哪些列。
![](https://img.haomeiwen.com/i10780526/b0c9d0125fb52a66.png)
datSignif <- read.csv("TM_plasma_FC_VIP_ttest.csv")
![](https://img.haomeiwen.com/i10780526/777b4596470b1e28.png)
步骤
合并1和2步骤为一个脚本,步骤1推荐使用massdatabase
R包,该步是为了获取特定物种的pathway和compound对应关系。
-
下载KEGG pathway对应的compound信息。本次分析是针对小鼠,因此在选择pathway的时候物种设置为小鼠。
-
读取每个pathway包含的compound后,整理成pathway和compound一一对应的关系表;
library(dplyr)
library(tibble)
library(ggplot2)
library(massdatabase)
library(clusterProfiler)
get_kegg_pathway_compound <- function(
org_id = c("hsa", "mmu")) {
pathway_names <- request_kegg_pathway_info(organism = org_id)
res <- data.frame()
for (i in 1:nrow(pathway_names)) {
temp_pathway <- request_kegg_pathway(pathway_id = pathway_names$KEGG.ID[i])
temp_colnames <- names(temp_pathway)
if (all(c("COMPOUND", "ENTRY", "NAME", "CLASS", "PATHWAY_MAP") %in% temp_colnames)) {
print(i)
temp_compound <- temp_pathway$COMPOUND
temp_class <- unlist(strsplit(temp_pathway$CLASS, "; "))
temp_df1 <- data.frame(Pathway = temp_pathway$ENTRY,
NAME = temp_pathway$NAME,
#DESCRIPTION = temp_pathway$DESCRIPTION,
CLASS1 = temp_class[1],
CLASS2 = temp_class[2],
PATHWAY_MAP = temp_pathway$PATHWAY_MAP)
temp_df2 <- data.frame(Pathway = rep(temp_pathway$ENTRY, length(temp_compound)),
COMPOUND = names(temp_compound),
COMPOUND_DESCRIPTION = temp_compound)
temp_df <- temp_df1 %>%
dplyr::full_join(temp_df2, by = "Pathway")
res <- rbind(res, temp_df)
}
}
return(res)
}
if (file.exists("KEGG_COMPOUND_PATHWAY_mmu.csv")) {
mmu_kegg_compound <- read.csv("KEGG_COMPOUND_PATHWAY/KEGG_COMPOUND_PATHWAY_mmu.csv")
} else {
mmu_kegg_compound <- get_kegg_pathway_compound(org_id = "mmu")
write.csv(mmu_kegg_compound, "KEGG_COMPOUND_PATHWAY_mmu.csv", row.names = F)
}
- 用ClusterProfiler提供的enrichr和GESA做分析,这两个函数可自行配置输入pathway database,然后采用超几何检验和富集rank得分评估富集状况。
get_enrichment <- function(
dat = datSignif,
ref = mmu_kegg_compound,
group = c("G1 vs G2", "G1 vs G3", "G2 vs G3"),
direction = c(NULL, "DownRegulated", "UpRegulated"), # DownRegulated->1st group; UpRegulated->2nd group
index_names = c("FoldChange", "Log2FoldChange", "VIP", "CorPvalue", "Pvalue", "AdjustedPvalue"),
index_cutoff = c(1, 1, 1, 0.05, 0.05, 0.05),
enrich_type = c("ORA", "GSEA")) {
signif_df <- dat %>%
dplyr::filter(Block2 %in% group)
colnames(signif_df)[which(colnames(signif_df) == index_names[1])] <- "DA_index1"
colnames(signif_df)[which(colnames(signif_df) == index_names[2])] <- "DA_index2"
signif_df_cln <- signif_df %>%
dplyr::filter(abs(DA_index1) > index_cutoff[1]) %>%
dplyr::filter(DA_index2 < index_cutoff[2]) %>%
dplyr::filter(cpd_ID != "-")
signif_df_cln$cpd_ID <- gsub("\\s+\\|\\s+\\w+\\d+", "", signif_df_cln$cpd_ID)
if (nrow(signif_df_cln) == 0) {
stop("Beyond these thresholds, no significant metabolites were selected")
}
signif_temp <- signif_df_cln %>%
dplyr::select(FeatureID, Block2, cpd_ID, Compounds, DA_index1, DA_index2) %>%
dplyr::arrange(desc(DA_index1))
if (all(index_names[1] == "Log2FoldChange", direction == "DownRegulated", !is.null(direction))) {
inputdata <- signif_temp %>%
dplyr::filter(DA_index1 > 0)
} else if (all(index_names[1] == "Log2FoldChange", direction == "UpRegulated", !is.null(direction))) {
inputdata <- signif_temp %>%
dplyr::filter(DA_index1 < 0)
} else if (any(index_names[1] != "Log2FoldChange", is.null(direction))) {
inputdata <- signif_temp
}
if (nrow(inputdata) == 0) {
stop("Beyond these thresholds, no significant metabolites were selected when using direction")
}
ref_cln <- ref %>%
dplyr::select(Pathway, COMPOUND)
if (enrich_type == "ORA") {
fit <- clusterProfiler::enricher(
gene = inputdata$cpd_ID,
pvalueCutoff = 0.05,
pAdjustMethod = "BH",
minGSSize = 10,
maxGSSize = 500,
qvalueCutoff = 0.2,
TERM2GENE = ref_cln)
} else if (enrich_type == "GSEA") {
cpd_list <- inputdata$DA_index1
names(cpd_list) <- inputdata$cpd_ID
fit <- clusterProfiler::GSEA(
geneList = cpd_list,
pvalueCutoff = 0.05,
pAdjustMethod = "BH",
minGSSize = 10,
maxGSSize = 500,
TERM2GENE = ref_cln)
}
if (nrow(fit@result) != 0) {
if (enrich_type == "ORA") {
EA_res <- fit@result %>%
dplyr::select(-Description) %>%
dplyr::inner_join(ref %>%
dplyr::select(Pathway, NAME, CLASS1, CLASS2, PATHWAY_MAP) %>%
dplyr::distinct(),
by = c("ID" = "Pathway")) %>%
dplyr::mutate(Group = group) %>%
dplyr::select(ID, NAME, everything()) %>%
dplyr::rename(CompoundRatio = GeneRatio,
CompoundID = geneID)
} else if (enrich_type == "GSEA") {
EA_res <- fit@result %>%
dplyr::select(-Description) %>%
dplyr::inner_join(ref %>%
dplyr::select(Pathway, NAME, CLASS1, CLASS2, PATHWAY_MAP) %>%
dplyr::distinct(),
by = c("ID" = "Pathway")) %>%
dplyr::mutate(Group = group) %>%
dplyr::select(ID, NAME, everything())
}
} else {
EA_res <- NULL
}
colnames(inputdata)[which(colnames(inputdata) == "DA_index1")] <- index_names[1]
colnames(inputdata)[which(colnames(inputdata) == "DA_index2")] <- index_names[2]
res <- list(data = inputdata,
enrich = EA_res)
return(res)
}
- 画气泡图或者条形图,根据qvalue和CompoundRatio以及Count画图
plot_enrichment <- function(
inputdata,
qval_cutoff = 0.05,
topN = 10,
plot_type = c("bubble", "bar"),
enrich_type = c("ORA", "GSEA")) {
if (nrow(inputdata) == 0) {
stop("No pathway please check your input")
}
if (enrich_type == "ORA") {
if (any(is.na(inputdata$qvalue))) {
inputdata$qvalue <- inputdata$p.adjust
}
plotdata_temp <- inputdata %>%
dplyr::select(ID, NAME, CompoundRatio, qvalue, Count, CLASS1, CLASS2, PATHWAY_MAP, Group) %>%
dplyr::filter(qvalue < qval_cutoff)
if (nrow(plotdata_temp) == 0) {
stop("No pathway met the threshold of qvalue")
}
plotdata <- plotdata_temp %>%
dplyr::slice(1:topN) %>%
dplyr::mutate(qvalue = as.numeric(qvalue),
Count = as.numeric(Count)) %>%
dplyr::group_by(ID) %>%
dplyr::mutate(CompoundRatio1 = unlist(strsplit(CompoundRatio, "/"))[1],
CompoundRatio2 = unlist(strsplit(CompoundRatio, "/"))[2],
CompoundRatio = as.numeric(CompoundRatio1) / as.numeric(CompoundRatio2)) %>%
dplyr::select(-all_of(c("CompoundRatio1", "CompoundRatio2"))) %>%
dplyr::mutate(NAME = gsub(" - Mus musculus \\(house mouse\\)", "", NAME)) %>%
dplyr::arrange(CompoundRatio) %>%
dplyr::rename(Yvalue = CompoundRatio)
y_lab <- "CompoundRatio"
} else if (enrich_type == "GSEA") {
if (any(is.na(inputdata$qvalue))) {
inputdata$qvalues <- inputdata$p.adjust
}
plotdata_temp <- inputdata %>%
dplyr::select(ID, NAME, enrichmentScore, qvalues, setSize, CLASS1, CLASS2, PATHWAY_MAP, Group) %>%
dplyr::filter(qvalues < qval_cutoff)
if (nrow(plotdata_temp) == 0) {
stop("No pathway met the threshold of qvalue")
}
plotdata <- plotdata_temp %>%
dplyr::slice(1:topN) %>%
dplyr::rename(qvalue = qvalues,
Count = setSize) %>%
dplyr::mutate(qvalue = as.numeric(qvalue),
Count = as.numeric(Count),
enrichmentScore = as.numeric(enrichmentScore)) %>%
dplyr::mutate(NAME = gsub(" - Mus musculus \\(house mouse\\)", "", NAME)) %>%
dplyr::arrange(enrichmentScore) %>%
dplyr::rename(Yvalue = enrichmentScore)
y_lab <- "enrichmentScore"
}
plotdata$NAME <- factor(plotdata$NAME, levels = as.character(plotdata$NAME))
final_topN <- nrow(plotdata)
if (plot_type == "bubble") {
pl <- ggplot(plotdata, aes(x = NAME, y = Yvalue)) +
geom_point(aes(size = Count, color = qvalue)) +
coord_flip() +
labs(x = "", y = y_lab, title = paste("Top", final_topN, "of Enrichment")) +
theme_bw() +
guides(size = guide_legend(order = 1),
color = guide_legend(order = 2))+
scale_color_gradientn(name = "q value",
colours = rainbow(5)) +
theme(plot.title = element_text(face = 'bold', size = 12, hjust = .5),
axis.title = element_text(face = 'bold', size = 11),
axis.text.y = element_text(face = 'bold', size = 10),
legend.position = "right")
} else if (plot_type == "bar") {
pl <- ggplot(plotdata, aes(x = NAME, y = Yvalue, fill = -log(qvalue))) +
geom_bar(stat = "identity", position = position_dodge()) +
coord_flip() +
labs(x = "", y = y_lab, title = paste("Top", final_topN, "of Enrichment")) +
theme_bw() +
guides(fill = guide_legend(order = 1))+
scale_color_gradientn(name = "-log10(q value)",
colours = rainbow(5)) +
theme(plot.title = element_text(face = 'bold', size = 12, hjust = .5),
axis.title = element_text(face = 'bold', size = 11),
axis.text.y = element_text(face = 'bold', size = 10),
legend.position = "right")
}
return(pl)
}
G1vsG2案例
本研究比较不同条件下G1和G2组的代谢物是否存在差异,且差异代谢物富集在哪些KEGG Pathway上,在明确显著富集pathway后,可聚焦在该pathway的代谢物,进而存在阐明机制的可能性。
- ORA
G1_G2_ORA <- get_enrichment(
dat = datSignif,
group = "G1 vs G2",
direction = NULL,
index_names = c("Log2FoldChange", "AdjustedPvalue"),
index_cutoff = c(0, 0.05),
enrich_type = "ORA")
head(G1_G2_ORA$enrich)
![](https://img.haomeiwen.com/i10780526/64b4653741f092b8.png)
- bubble
G1_G2_ORA_bubble <- plot_enrichment(
inputdata = G1_G2_ORA$enrich,
qval_cutoff = 0.05,
topN = 20,
plot_type = "bubble",
enrich_type = "ORA")
G1_G2_ORA_bubble
![](https://img.haomeiwen.com/i10780526/b0fda165927a48f3.png)
- 结果解析
从上述富集通路看,Tryptophan metabolism可通过代谢Tryptophan产生一些影响宿主免疫的物质如indole(吲哚),现有研究已经证实indole(吲哚)可增强T细胞的免疫。综上,可对富集在该通路的代谢物的上下游基因或酶做进一步研究。
问题
- 上述富集分析,可再根据差异代谢物富集方向做进一步分析,上述合并一起分析对后面富集通路不好解释作用方向。
网友评论