GSEA做图有多种办法,有软件默认的图,不太适合发表,所以我就想办法搞些其他做法。
首先Y 叔clusterProfiler集成了分析及一体化算法。
另外比如Gene Set Enrichment Analysis in R也给出了自己的代码。
还有fgsea的办法。
但是这些方法我都不太习惯,也不方便所以我仔细去看了下官方软件GSEA分析的结果,在结果文件夹中有个edb文件夹,里面又有一个.edb 和 .rank文件,这个文件我看了下应该就是做图的原始文件,所以我就想把它拆分为ggplot2可以识别的文件。这样以后做图和分析比较方便。
代码参考了这篇教程,结合我的理解,我写成了一个初步的代码,后期可能可以优化一下。
下面的例子拼接了两个数据,组合做图,单个数据自己稍微修改下即可。
## Script by Thomas Kuilman
## Revise by Chengfei Jiang 11052020
## path argument: path to output folder of analysis (e.g. PATH/my_analysis.GseaPreranked.1470948568349)
## gene.set argument: name of the gene set (e.g. V$AP1_Q2).
## It is used in a grep command, so multiple matching is possible.
## Also, R regular expressions can be handled, e.g. "IL2[0-9]$"
## Leading "V$" from gene set names are stripped to allow using the grep command.
## In case of multiple grep matches a warning is given and the first option is plotted.
## class.name: the name of the class / variable to which genes have been correlated (e.g. drug-treatment)
## metric.range: the range of the metric; defaults to [min(DEFINED RANGE), max(DEFINED RANGE)]
## enrichment.score.range: the range of the enrichment score; defaults to [min(ENRICHMENT SCORE), max(ENRICHMENT SCORE)]
suppressMessages(library(ggpubr))
suppressMessages(library(tidyverse))
suppressMessages(library(DOSE))
replotGSEA <- function(path, gene.set) {
## Load .rnk data
path.rnk <- list.files(path = file.path(path, "edb"),
pattern = ".rnk$", full.names = TRUE)
gsea.rnk <- read.delim(file = path.rnk, header = FALSE)
colnames(gsea.rnk) <- c("hgnc.symbol", "metric")
metric.range <- c(min(gsea.rnk$metric), max(gsea.rnk$metric))
## Load .edb data
path.edb <- list.files(path = file.path(path, "edb"),
pattern = ".edb$", full.names = TRUE)
gsea.edb <- read.delim(file = path.edb,
header = FALSE, stringsAsFactors = FALSE)
gsea.edb <- unlist(gsea.edb)
gsea.metric <- gsea.edb[grep("METRIC=", gsea.edb)]
gsea.metric <- unlist(strsplit(gsea.metric, " "))
gsea.metric <- gsea.metric[grep("METRIC=", gsea.metric)]
gsea.metric <- gsub("METRIC=", "", gsea.metric)
gsea.edb <- gsea.edb[grep("<DTG", gsea.edb)]
# Select the right gene set
if (length(gsea.edb) == 0) {
stop(paste("The gene set name was not found, please provide",
"a correct name"))
}
if (length(grep(paste0(gsub(".\\$(.*$)", "\\1", gene.set), " "), gsea.edb)) > 1) {
warning(paste("More than 1 gene set matched the gene.set",
"argument; the first match is plotted"))
}
gsea.edb <- gsea.edb[grep(paste0(gsub(".\\$(.*$)", "\\1", gene.set), " "), gsea.edb)[1]]
# Get template name
gsea.edb <- gsub(".*TEMPLATE=(.*)", "\\1", gsea.edb)
gsea.edb <- unlist(strsplit(gsea.edb, " "))
gsea.template <- gsea.edb[1]
# Get gene set name
gsea.gene.set <- gsea.edb[2]
gsea.gene.set <- gsub("GENESET=gene_sets.gmt#", "", gsea.gene.set)
# Get enrichment score
gsea.enrichment.score <- gsea.edb[3]
gsea.enrichment.score <- gsub("ES=", "", gsea.enrichment.score)
# Get gene set name
gsea.normalized.enrichment.score <- gsea.edb[4]
gsea.normalized.enrichment.score <- gsub("NES=", "",
gsea.normalized.enrichment.score)
# Get nominal p-value
gsea.p.value <- gsea.edb[5]
gsea.p.value <- gsub("NP=", "", gsea.p.value)
gsea.p.value <- as.numeric(gsea.p.value)
# Get FDR
gsea.fdr <- gsea.edb[6]
gsea.fdr <- gsub("FDR=", "", gsea.fdr)
gsea.fdr <- as.numeric(gsea.fdr)
# Get hit indices
gsea.edb <- gsea.edb[grep("HIT_INDICES=", gsea.edb):length(gsea.edb)]
gsea.hit.indices <- gsea.edb[seq_len(grep("ES_PROFILE=", gsea.edb) - 1)]
gsea.hit.indices <- gsub("HIT_INDICES=", "", gsea.hit.indices)
gsea.hit.indices <- as.integer(gsea.hit.indices)
# Get ES profile
gsea.edb <- gsea.edb[grep("ES_PROFILE=", gsea.edb):length(gsea.edb)]
gsea.es.profile <- gsea.edb[seq_len(grep("RANK_AT_ES=", gsea.edb) - 1)]
gsea.es.profile <- gsub("ES_PROFILE=", "", gsea.es.profile)
gsea.es.profile <- as.numeric(gsea.es.profile)
# Set enrichment score range
enrichment.score.range <- c(min(gsea.es.profile), max(gsea.es.profile))
## Create GSEA plot
# Save default for resetting
def.par <- par(no.readonly = TRUE)
# Create plots
par(mar = c(0, 5, 2, 2))
rt<-data.frame(c(1, gsea.hit.indices, length(gsea.rnk$metric)))
colnames(rt)<-"hit"
rt$"hit"<-as.numeric(as.character(rt$hit))
rt$"es"<-c(0, gsea.es.profile, 0)
return(rt)
message(paste0("NES=",as.character(gsea.normalized.enrichment.score)))
message(paste0("Normal P=",as.character(gsea.p.value)))
message(paste0("FDR=",as.character(gsea.fdr)))
}
path_human<-"~/my_analysis.GseaPreranked.1604608899819/"
path_mouse<-"~/my_analysis.GseaPreranked.1604608920473/"
human_glucose<-replotGSEA(path = path_human,gene.set = gene_set)
mouse_glucose<-replotGSEA(path = path_mouse,gene.set = gene_set)
gene_set="GO_GLUCOSE_METABOLIC_PROCESS"
rt<-rbind(human_glucose,mouse_glucose)
rt$group<-c(rep("Human",nrow(human_glucose)),rep("Mouse",nrow(mouse_glucose)))
p<-ggline(rt, x = "hit", y = "es",
plot_type="l",size=0.75,color ="group",palette = "npg",
numeric.x.axis=T,ylab = "Enrichment score (ES)",title=gene_set)+
theme(axis.title.x=element_blank(),axis.text.x=element_blank(), axis.ticks.x=element_blank())
p<-p+geom_hline(yintercept = 0,color="grey70")
rt$ymin<-0.1
rt$ymax<-(-0.1)
g<-ggplot(rt, aes_(x = ~hit)) +geom_linerange(aes_(ymin=~ymin, ymax=~ymax,color=~group))+theme_bw()+theme_classic()
g<-g+theme(axis.title.x=element_blank(),axis.text.x=element_blank(), axis.ticks.x=element_blank())
g<-g+theme(axis.title.y=element_blank(),axis.text.y=element_blank(), axis.ticks.y=element_blank())
g<-g +theme(legend.position="none")+theme(axis.line = element_line(size=0))
ggarrange(p, g, heights = c(2, 0.7),
ncol = 1, nrow = 2,align = "hv")
成图如下,但是美观上还有待改进。
![](https://img.haomeiwen.com/i9498237/daf2b1dbabbd0637.png)
网友评论