GSEA自定义做图

作者: 落寞的橙子 | 来源:发表于2020-11-06 10:11 被阅读0次

GSEA自定义做图
用clusterProfiler做GSEA（二）
用clusterProfiler做GSEA
专题：富集分析
GSEA分析
使用log2FC排序的基因集进行GSEA分析
GSEA分析
ssGSEA算法原理解析
GSEA的分析汇总-转载
fgsea做GSEA

GSEA做图有多种办法，有软件默认的图，不太适合发表，所以我就想办法搞些其他做法。
首先Y 叔clusterProfiler集成了分析及一体化算法。
另外比如Gene Set Enrichment Analysis in R也给出了自己的代码。
还有fgsea的办法。
但是这些方法我都不太习惯，也不方便所以我仔细去看了下官方软件GSEA分析的结果，在结果文件夹中有个edb文件夹，里面又有一个.edb 和 .rank文件，这个文件我看了下应该就是做图的原始文件，所以我就想把它拆分为ggplot2可以识别的文件。这样以后做图和分析比较方便。
代码参考了这篇教程，结合我的理解，我写成了一个初步的代码，后期可能可以优化一下。
下面的例子拼接了两个数据，组合做图，单个数据自己稍微修改下即可。


## Script by Thomas Kuilman
## Revise by Chengfei Jiang 11052020
## path argument: path to output folder of analysis (e.g. PATH/my_analysis.GseaPreranked.1470948568349)
## gene.set argument: name of the gene set (e.g. V$AP1_Q2).
## It is used in a grep command, so multiple matching is possible.
## Also, R regular expressions can be handled, e.g. "IL2[0-9]$"
## Leading "V$" from gene set names are stripped to allow using the grep command.
## In case of multiple grep matches a warning is given and the first option is plotted.
## class.name: the name of the class / variable to which genes have been correlated (e.g. drug-treatment)
## metric.range: the range of the metric; defaults to [min(DEFINED RANGE), max(DEFINED RANGE)]
## enrichment.score.range: the range of the enrichment score; defaults to [min(ENRICHMENT SCORE), max(ENRICHMENT SCORE)]
suppressMessages(library(ggpubr))
suppressMessages(library(tidyverse))
suppressMessages(library(DOSE))

replotGSEA <- function(path, gene.set) {
  ## Load .rnk data
  path.rnk <- list.files(path = file.path(path, "edb"),
                         pattern = ".rnk$", full.names = TRUE)
  gsea.rnk <- read.delim(file = path.rnk, header = FALSE)
  colnames(gsea.rnk) <- c("hgnc.symbol", "metric")
   metric.range <- c(min(gsea.rnk$metric), max(gsea.rnk$metric))
  
  ## Load .edb data
  path.edb <- list.files(path = file.path(path, "edb"),
                         pattern = ".edb$", full.names = TRUE)
  gsea.edb <- read.delim(file = path.edb,
                         header = FALSE, stringsAsFactors = FALSE)
  gsea.edb <- unlist(gsea.edb)
  gsea.metric <- gsea.edb[grep("METRIC=", gsea.edb)]
  gsea.metric <- unlist(strsplit(gsea.metric, " "))
  gsea.metric <- gsea.metric[grep("METRIC=", gsea.metric)]
  gsea.metric <- gsub("METRIC=", "", gsea.metric)
  gsea.edb <- gsea.edb[grep("<DTG", gsea.edb)]
  
  # Select the right gene set
  if (length(gsea.edb) == 0) {
    stop(paste("The gene set name was not found, please provide",
               "a correct name"))
  }
  if (length(grep(paste0(gsub(".\\$(.*$)", "\\1", gene.set), " "), gsea.edb)) > 1) {
    warning(paste("More than 1 gene set matched the gene.set",
                  "argument; the first match is plotted"))
  }
  gsea.edb <- gsea.edb[grep(paste0(gsub(".\\$(.*$)", "\\1", gene.set), " "), gsea.edb)[1]]
  
  # Get template name
  gsea.edb <- gsub(".*TEMPLATE=(.*)", "\\1", gsea.edb)
  gsea.edb <- unlist(strsplit(gsea.edb, " "))
  gsea.template <- gsea.edb[1]
  
  # Get gene set name
  gsea.gene.set <- gsea.edb[2]
  gsea.gene.set <- gsub("GENESET=gene_sets.gmt#", "", gsea.gene.set)
  
  # Get enrichment score
  gsea.enrichment.score <- gsea.edb[3]
  gsea.enrichment.score <- gsub("ES=", "", gsea.enrichment.score)
  
  # Get gene set name
  gsea.normalized.enrichment.score <- gsea.edb[4]
  gsea.normalized.enrichment.score <- gsub("NES=", "",
                                           gsea.normalized.enrichment.score)
  
  # Get nominal p-value
  gsea.p.value <- gsea.edb[5]
  gsea.p.value <- gsub("NP=", "", gsea.p.value)
  gsea.p.value <- as.numeric(gsea.p.value)
  
  # Get FDR
  gsea.fdr <- gsea.edb[6]
  gsea.fdr <- gsub("FDR=", "", gsea.fdr)
  gsea.fdr <- as.numeric(gsea.fdr)
  
  # Get hit indices
  gsea.edb <- gsea.edb[grep("HIT_INDICES=", gsea.edb):length(gsea.edb)]
  gsea.hit.indices <- gsea.edb[seq_len(grep("ES_PROFILE=", gsea.edb) - 1)]
  gsea.hit.indices <- gsub("HIT_INDICES=", "", gsea.hit.indices)
  gsea.hit.indices <- as.integer(gsea.hit.indices)
  
  # Get ES profile
  gsea.edb <- gsea.edb[grep("ES_PROFILE=", gsea.edb):length(gsea.edb)]
  gsea.es.profile <- gsea.edb[seq_len(grep("RANK_AT_ES=", gsea.edb) - 1)]
  gsea.es.profile <- gsub("ES_PROFILE=", "", gsea.es.profile)
  gsea.es.profile <- as.numeric(gsea.es.profile)
  
  # Set enrichment score range
    enrichment.score.range <- c(min(gsea.es.profile), max(gsea.es.profile))
  ## Create GSEA plot
  # Save default for resetting
  def.par <- par(no.readonly = TRUE)
  # Create plots
  par(mar = c(0, 5, 2, 2))
  
  rt<-data.frame(c(1, gsea.hit.indices, length(gsea.rnk$metric)))
  colnames(rt)<-"hit"
  rt$"hit"<-as.numeric(as.character(rt$hit))
  rt$"es"<-c(0, gsea.es.profile, 0)
  return(rt)
  message(paste0("NES=",as.character(gsea.normalized.enrichment.score)))
  message(paste0("Normal P=",as.character(gsea.p.value)))
  message(paste0("FDR=",as.character(gsea.fdr)))
}




path_human<-"~/my_analysis.GseaPreranked.1604608899819/"
path_mouse<-"~/my_analysis.GseaPreranked.1604608920473/"


human_glucose<-replotGSEA(path = path_human,gene.set = gene_set)
mouse_glucose<-replotGSEA(path = path_mouse,gene.set = gene_set)

gene_set="GO_GLUCOSE_METABOLIC_PROCESS"
rt<-rbind(human_glucose,mouse_glucose)
rt$group<-c(rep("Human",nrow(human_glucose)),rep("Mouse",nrow(mouse_glucose)))
p<-ggline(rt, x = "hit", y = "es", 
          plot_type="l",size=0.75,color ="group",palette = "npg",
          numeric.x.axis=T,ylab = "Enrichment score (ES)",title=gene_set)+
theme(axis.title.x=element_blank(),axis.text.x=element_blank(), axis.ticks.x=element_blank())
p<-p+geom_hline(yintercept = 0,color="grey70")
rt$ymin<-0.1
rt$ymax<-(-0.1)

g<-ggplot(rt, aes_(x = ~hit)) +geom_linerange(aes_(ymin=~ymin, ymax=~ymax,color=~group))+theme_bw()+theme_classic()

g<-g+theme(axis.title.x=element_blank(),axis.text.x=element_blank(), axis.ticks.x=element_blank())
g<-g+theme(axis.title.y=element_blank(),axis.text.y=element_blank(), axis.ticks.y=element_blank())
g<-g +theme(legend.position="none")+theme(axis.line = element_line(size=0))
ggarrange(p, g, heights = c(2, 0.7),
          ncol = 1, nrow = 2,align = "hv")

成图如下，但是美观上还有待改进。

image.png