美文网首页
数据分析:代谢组功能分析

数据分析:代谢组功能分析

作者: 生信学习者2 | 来源:发表于2023-05-29 17:06 被阅读0次

前提

代谢物富集分析的目的是为了解析某些差异代谢物是否落在某些pathway上(可简单理解为单个差异代谢物解释pathway较弱,同一pathway的代谢物共同解释该通路变化则证据较为robust,在很多生物领域均存在类似的处理逻辑),进而影响pathway的功能。基于pathway可分析代谢物的上下游基因或酶等影响,最终存在阐明作用机制的可能性。

代谢组服务公司反馈回来的代谢组表一般是基于intensity也即是质谱峰强度的数据,需要做前处理方可使用。本次使用直接处理后的数据用于分析。

PS:代谢组ID也有类似基因ID的多平台多命名问题,可以参考类似的名称转换工具包。本次直接使用cpdid,可直接用于KEGG COMPOUND数据库。

输入数据

因为本次研究是内部暂未公开的数据,所以不能提供给大家使用,但可以细说一下有哪些列。


datSignif <- read.csv("TM_plasma_FC_VIP_ttest.csv")

步骤

合并1和2步骤为一个脚本,步骤1推荐使用massdatabaseR包,该步是为了获取特定物种的pathway和compound对应关系。

  1. 下载KEGG pathway对应的compound信息。本次分析是针对小鼠,因此在选择pathway的时候物种设置为小鼠。

  2. 读取每个pathway包含的compound后,整理成pathway和compound一一对应的关系表;

library(dplyr)
library(tibble)
library(ggplot2)
library(massdatabase)
library(clusterProfiler)

get_kegg_pathway_compound <- function(
    org_id = c("hsa", "mmu")) {
  
  pathway_names <- request_kegg_pathway_info(organism = org_id)
  
  res <- data.frame()
  for (i in 1:nrow(pathway_names)) {
    temp_pathway <- request_kegg_pathway(pathway_id = pathway_names$KEGG.ID[i])
    temp_colnames <- names(temp_pathway)
    if (all(c("COMPOUND", "ENTRY", "NAME", "CLASS", "PATHWAY_MAP") %in% temp_colnames)) {
      print(i)
      temp_compound <- temp_pathway$COMPOUND
      temp_class <- unlist(strsplit(temp_pathway$CLASS, "; "))
      temp_df1 <- data.frame(Pathway = temp_pathway$ENTRY,
                             NAME = temp_pathway$NAME,
                             #DESCRIPTION = temp_pathway$DESCRIPTION,
                             CLASS1 = temp_class[1],
                             CLASS2 = temp_class[2],
                             PATHWAY_MAP = temp_pathway$PATHWAY_MAP)
      temp_df2 <- data.frame(Pathway = rep(temp_pathway$ENTRY, length(temp_compound)),
                             COMPOUND = names(temp_compound),
                             COMPOUND_DESCRIPTION = temp_compound)
      
      temp_df <- temp_df1 %>%
        dplyr::full_join(temp_df2, by = "Pathway")
      
      res <- rbind(res, temp_df)
    }
  }
  
  return(res)
}

if (file.exists("KEGG_COMPOUND_PATHWAY_mmu.csv")) {
  mmu_kegg_compound <- read.csv("KEGG_COMPOUND_PATHWAY/KEGG_COMPOUND_PATHWAY_mmu.csv") 
} else {
  mmu_kegg_compound <- get_kegg_pathway_compound(org_id = "mmu")
  write.csv(mmu_kegg_compound, "KEGG_COMPOUND_PATHWAY_mmu.csv", row.names = F)
}
  1. 用ClusterProfiler提供的enrichr和GESA做分析,这两个函数可自行配置输入pathway database,然后采用超几何检验和富集rank得分评估富集状况。
get_enrichment <- function(
  dat = datSignif,
  ref = mmu_kegg_compound,
  group = c("G1 vs G2", "G1 vs G3", "G2 vs G3"),
  direction = c(NULL, "DownRegulated", "UpRegulated"), # DownRegulated->1st group; UpRegulated->2nd group 
  index_names = c("FoldChange", "Log2FoldChange", "VIP", "CorPvalue", "Pvalue", "AdjustedPvalue"),
  index_cutoff = c(1, 1, 1, 0.05, 0.05, 0.05),
  enrich_type = c("ORA", "GSEA")) {
  
  signif_df <- dat %>%
    dplyr::filter(Block2 %in% group)
  
  colnames(signif_df)[which(colnames(signif_df) == index_names[1])] <- "DA_index1"
  colnames(signif_df)[which(colnames(signif_df) == index_names[2])] <- "DA_index2"
  
  signif_df_cln <- signif_df %>%
    dplyr::filter(abs(DA_index1) > index_cutoff[1]) %>%
    dplyr::filter(DA_index2 < index_cutoff[2]) %>%
    dplyr::filter(cpd_ID != "-")

  signif_df_cln$cpd_ID <- gsub("\\s+\\|\\s+\\w+\\d+", "", signif_df_cln$cpd_ID)
  
  if (nrow(signif_df_cln) == 0) {
    stop("Beyond these thresholds, no significant metabolites were selected")
  }
  
  signif_temp <- signif_df_cln %>%
    dplyr::select(FeatureID, Block2, cpd_ID, Compounds, DA_index1, DA_index2) %>%
    dplyr::arrange(desc(DA_index1))
  
  if (all(index_names[1] == "Log2FoldChange", direction == "DownRegulated", !is.null(direction))) {
    inputdata <- signif_temp %>%
      dplyr::filter(DA_index1 > 0)
  } else if (all(index_names[1] == "Log2FoldChange", direction == "UpRegulated", !is.null(direction))) {
    inputdata <- signif_temp %>%
      dplyr::filter(DA_index1 < 0)    
  } else if (any(index_names[1] != "Log2FoldChange", is.null(direction))) {
    inputdata <- signif_temp   
  }
  
  if (nrow(inputdata) == 0) {
    stop("Beyond these thresholds, no significant metabolites were selected when using direction")
  }  
  
  ref_cln <- ref %>%
    dplyr::select(Pathway, COMPOUND)
  
  if (enrich_type == "ORA") {
    fit <- clusterProfiler::enricher(
                gene = inputdata$cpd_ID,
                pvalueCutoff = 0.05,
                pAdjustMethod = "BH",
                minGSSize = 10,
                maxGSSize = 500,
                qvalueCutoff = 0.2,
                TERM2GENE = ref_cln)    
  } else if (enrich_type == "GSEA") {
    cpd_list <- inputdata$DA_index1
    names(cpd_list) <- inputdata$cpd_ID
    fit <- clusterProfiler::GSEA(
                geneList = cpd_list,
                pvalueCutoff = 0.05,
                pAdjustMethod = "BH",
                minGSSize = 10,
                maxGSSize = 500,      
                TERM2GENE = ref_cln)     
  }

  if (nrow(fit@result) != 0) {
    if (enrich_type == "ORA") {
      EA_res <- fit@result %>%
        dplyr::select(-Description) %>%
        dplyr::inner_join(ref %>%
                            dplyr::select(Pathway, NAME, CLASS1, CLASS2, PATHWAY_MAP) %>%
                            dplyr::distinct(),
                          by = c("ID" = "Pathway")) %>%
        dplyr::mutate(Group = group) %>%
        dplyr::select(ID, NAME, everything()) %>%
        dplyr::rename(CompoundRatio = GeneRatio,
                      CompoundID = geneID)
    } else if (enrich_type == "GSEA") {
      EA_res <- fit@result %>%
        dplyr::select(-Description) %>%
        dplyr::inner_join(ref %>%
                            dplyr::select(Pathway, NAME, CLASS1, CLASS2, PATHWAY_MAP) %>%
                            dplyr::distinct(),
                          by = c("ID" = "Pathway")) %>%
        dplyr::mutate(Group = group) %>%
        dplyr::select(ID, NAME, everything())       
    }
  } else {
    EA_res <- NULL
  }

  colnames(inputdata)[which(colnames(inputdata) == "DA_index1")] <- index_names[1]
  colnames(inputdata)[which(colnames(inputdata) == "DA_index2")] <- index_names[2]  
  
  res <- list(data = inputdata,
              enrich = EA_res)
  
  return(res)
}
  1. 画气泡图或者条形图,根据qvalue和CompoundRatio以及Count画图
plot_enrichment <- function(
    inputdata,
    qval_cutoff = 0.05,
    topN = 10,
    plot_type = c("bubble", "bar"),
    enrich_type = c("ORA", "GSEA")) {
  
  if (nrow(inputdata) == 0) {
    stop("No pathway please check your input")
  }  
  
  if (enrich_type == "ORA") {
    
    if (any(is.na(inputdata$qvalue))) {
      inputdata$qvalue <- inputdata$p.adjust
    }
    
    plotdata_temp <- inputdata %>%
      dplyr::select(ID, NAME, CompoundRatio, qvalue, Count, CLASS1, CLASS2, PATHWAY_MAP, Group) %>%
      dplyr::filter(qvalue < qval_cutoff)
    
    if (nrow(plotdata_temp) == 0) {
      stop("No pathway met the threshold of qvalue")
    }    
      
    plotdata <- plotdata_temp %>%  
      dplyr::slice(1:topN) %>%
      dplyr::mutate(qvalue = as.numeric(qvalue),
                    Count = as.numeric(Count)) %>%
      dplyr::group_by(ID) %>%
      dplyr::mutate(CompoundRatio1 = unlist(strsplit(CompoundRatio, "/"))[1],
                    CompoundRatio2 = unlist(strsplit(CompoundRatio, "/"))[2],
                    CompoundRatio = as.numeric(CompoundRatio1) / as.numeric(CompoundRatio2)) %>%
      dplyr::select(-all_of(c("CompoundRatio1", "CompoundRatio2"))) %>%
      dplyr::mutate(NAME = gsub(" - Mus musculus \\(house mouse\\)", "", NAME)) %>%
      dplyr::arrange(CompoundRatio)  %>%
      dplyr::rename(Yvalue = CompoundRatio)
    y_lab <- "CompoundRatio"
  } else if (enrich_type == "GSEA") {
    if (any(is.na(inputdata$qvalue))) {
      inputdata$qvalues <- inputdata$p.adjust
    }    
    plotdata_temp <- inputdata %>%
      dplyr::select(ID, NAME, enrichmentScore, qvalues, setSize, CLASS1, CLASS2, PATHWAY_MAP, Group) %>%
      dplyr::filter(qvalues < qval_cutoff)
    
    if (nrow(plotdata_temp) == 0) {
      stop("No pathway met the threshold of qvalue")
    }    
      
    plotdata <- plotdata_temp %>%  
      dplyr::slice(1:topN) %>%
      dplyr::rename(qvalue = qvalues,
                    Count = setSize) %>%
      dplyr::mutate(qvalue = as.numeric(qvalue),
                    Count = as.numeric(Count),
                    enrichmentScore = as.numeric(enrichmentScore)) %>%
      dplyr::mutate(NAME = gsub(" - Mus musculus \\(house mouse\\)", "", NAME)) %>%
      dplyr::arrange(enrichmentScore) %>%
      dplyr::rename(Yvalue = enrichmentScore)
    y_lab <- "enrichmentScore"
  } 

  plotdata$NAME <- factor(plotdata$NAME, levels = as.character(plotdata$NAME))
  
  final_topN <- nrow(plotdata)
  
  if (plot_type == "bubble") {
    pl <- ggplot(plotdata, aes(x = NAME, y = Yvalue)) +
      geom_point(aes(size = Count, color = qvalue)) +
      coord_flip() + 
      labs(x = "", y = y_lab, title = paste("Top", final_topN, "of Enrichment")) +
      theme_bw() +
      guides(size = guide_legend(order = 1),
             color = guide_legend(order = 2))+
      scale_color_gradientn(name = "q value", 
                            colours = rainbow(5)) +
      theme(plot.title = element_text(face = 'bold', size = 12, hjust = .5),
            axis.title = element_text(face = 'bold', size = 11),
            axis.text.y = element_text(face = 'bold', size = 10),
            legend.position = "right")    
  } else if (plot_type == "bar") {
    pl <- ggplot(plotdata, aes(x = NAME, y = Yvalue, fill = -log(qvalue))) +
      geom_bar(stat = "identity", position = position_dodge()) +
      coord_flip() + 
      labs(x = "", y = y_lab, title = paste("Top", final_topN, "of Enrichment")) +
      theme_bw() +
      guides(fill = guide_legend(order = 1))+
      scale_color_gradientn(name = "-log10(q value)",
                            colours = rainbow(5)) +
      theme(plot.title = element_text(face = 'bold', size = 12, hjust = .5),
            axis.title = element_text(face = 'bold', size = 11),
            axis.text.y = element_text(face = 'bold', size = 10),
            legend.position = "right")      
  }
  
  return(pl)
}

G1vsG2案例

本研究比较不同条件下G1和G2组的代谢物是否存在差异,且差异代谢物富集在哪些KEGG Pathway上,在明确显著富集pathway后,可聚焦在该pathway的代谢物,进而存在阐明机制的可能性。

  • ORA
G1_G2_ORA <- get_enrichment(
  dat = datSignif,
  group = "G1 vs G2",
  direction = NULL,  
  index_names = c("Log2FoldChange", "AdjustedPvalue"),
  index_cutoff = c(0, 0.05),
  enrich_type = "ORA")

head(G1_G2_ORA$enrich)
  • bubble
G1_G2_ORA_bubble <- plot_enrichment(
    inputdata = G1_G2_ORA$enrich,
    qval_cutoff = 0.05,
    topN = 20,
    plot_type = "bubble",
    enrich_type = "ORA") 

G1_G2_ORA_bubble
  • 结果解析

从上述富集通路看,Tryptophan metabolism可通过代谢Tryptophan产生一些影响宿主免疫的物质如indole(吲哚),现有研究已经证实indole(吲哚)可增强T细胞的免疫。综上,可对富集在该通路的代谢物的上下游基因或酶做进一步研究。

问题

  • 上述富集分析,可再根据差异代谢物富集方向做进一步分析,上述合并一起分析对后面富集通路不好解释作用方向。

参考

相关文章

网友评论

      本文标题:数据分析:代谢组功能分析

      本文链接:https://www.haomeiwen.com/subject/gxojedtx.html