GSVA其实就是pathway级别的差异分析,根据GSVA的原理其实就是计算每个sample的GSEA然后得出类似pathway enrich score,把这个可以当作芯片的表达数据一样,再用limma包分析差异基因。
参考以下两个例子:
使用GSVA方法计算某基因集在各个样本的表现
充分理解GSVA和GSEA
以及我一个例子:
rm(list=ls())
suppressMessages(library(GSVA))
suppressMessages(library(GSVAdata))
suppressMessages(library(GSEABase))
suppressMessages(library(limma))
#读入gmt文件,这个可以从MSigDB上下载,这边选的上gene symbol根据自己的data来选择
gmt_file="~/c5.bp.v7.1.symbols.gmt"
geneset <- getGmt(gmt_file)
#自行读入exp文件,我这边为行为gene symbol,列为样本(4个control,5个treatment样本)
es <- gsva(as.matrix(exp), geneset,
min.sz=10, max.sz=500, verbose=TRUE)
#得到gsva计算的数值后再用limma包做差异分析得到差异的pathway
design <- model.matrix(~ factor(c(rep("cont",4),rep("treatment",5))))
colnames(design) <- c("ALL", "contvstreatment")
row.names(design)<-colnames(exp)
fit <- lmFit(es, design)
fit <- eBayes(fit)
#这边是总的
allGeneSets <- topTable(fit, coef="contvstreatment", number=Inf)
#这边是差异的
adjPvalueCutoff <- 0.001
DEgeneSets <- topTable(fit, coef="contvstreatment", number=Inf,
p.value=adjPvalueCutoff, adjust="BH")
res <- decideTests(fit, p.value=adjPvalueCutoff)
例子2
gmt_file="~/Desktop/GoogleDrive/HPC/gmt/common/c5.bp.v7.1.symbols.gmt"
geneset <- getGmt(gmt_file)
tpm_1st_pass<-read.csv(paste0(data_dir,"TPM_GenePass.csv"),row.names = 1)
gsva_analysis<-function(sample,design,geneset,p,out_dir){
exp<-get(sample)
es <- gsva(as.matrix(exp), geneset,
min.sz=10, max.sz=500, verbose=TRUE)
colnames(design) <- c("ALL", "ALvsFast")
row.names(design)<-colnames(exp)
fit <- lmFit(es, design)
fit <- eBayes(fit)
allGeneSets <- topTable(fit, coef="ALvsFast", number=Inf)
DEgeneSets <- allGeneSets[allGeneSets$P.Value<p,]
write.csv(es,paste0(out_dir_table,sample,"_GSVA_es_matrix.csv"))
write.csv(allGeneSets,paste0(out_dir_table,sample,"_GSVA_degs_analysis_matrix.csv"))
write.csv(DEgeneSets,paste0(out_dir_table,sample,"_GSVA_degs_sig_p_",as.character(p),"_analysis_matrix.csv"))
output <- list(a = es, b = allGeneSets,c=DEgeneSets)
return(output)
}
p=0.05
design_1st <- model.matrix(~ factor(c(rep("AL",3),rep("Fast",3))))
gsva_1st<-gsva_analysis(sample="tpm_1st_pass",design=design_1st,geneset=geneset,p=p,out_dir=out_dir_table)
gsva_1st_es<-gsva_1st$a
gsva_1st_degs<-gsva_1st$b
gsva_1st_degs_sig<-gsva_1st$c
包装为一个函数,使用sbatch,下面是mouse data转化为human gene symbol然后run gsva
#!/usr/bin/env Rscript
#use for mouse data to human gsva analysis
#all data files and gmt files should be in the same folder
#@author: CFJiang 04252020
suppressMessages(library(argparse))
rm(list = ls())
options(stringsAsFactors=F)
parser <- ArgumentParser(description="")
parser$add_argument('-d', '--data_dir', metavar="*", help="path for your data")
parser$add_argument('-f', '--data_filename', metavar="*", help="data file name")
parser$add_argument('-p', '--prefix', metavar="*", help="prefix for your output")
parser$add_argument('-g1', '--group1', metavar="*", help="group1 name")
parser$add_argument('-g2', '--group2', metavar="*", help="group2 name")
parser$add_argument('-t', '--type', metavar="*", help="type for either KEGG or GO")
args <- parser$parse_args()
if (is.null(args$type)) { args$type <-"KEGG" }
#useage & example
data_dir= as.character(args$data_dir) #your data folder
data_filename= as.character(args$data_filename) #your data file full name
prefix= as.character(args$prefix) #your out put flile prefix
group1= as.character(args$group1) #your group1
group2= as.character(args$group2) #your group2
type=as.character(args$type) #your analysis type (KEGG or GO)
gsva_all_in_one<-function(data_dir,data_filename,prefix,group1,group2,type){
rm(list = ls())
options(stringsAsFactors=F)
suppressMessages(library(GSVA))
suppressMessages(library(GSEABase))
suppressMessages(library(limma))
suppressMessages(library(Hmisc))
change_name<-function(rt){
ann<-read.csv(paste0(data_dir,"/HHOM_MouseHumanSequence_mouse_human_combined.csv"),row.names = 1,header = T)
com_id<-intersect(row.names(ann),row.names(rt))
ann_com<-ann[com_id,]
rt_com<-rt[com_id,]
row.names(rt_com)<-as.character(ann_com$Symbol)
return(rt_com)
}
#read your data .csv
your_data<-read.csv(paste0(data_dir,"/",data_filename),header = T)
your_data<-your_data[!duplicated(your_data[,1]),]
row.names(your_data)<-as.character(your_data[,1])
your_data<-your_data[,-1]
#change the mouse gene name to human gene name
new_rt<-change_name(your_data)
#gsva analysis
gsva_de<-function(data_dir,new_rt,prefix,group1,group2,type){
new_dir<-paste0(data_dir,"/",prefix)
dir.create(new_dir,recursive = T)
setwd(new_dir)
if (type=="KEGG"){
gmt_file=paste0(data_dir,"/c2.cp.kegg.v7.1.symbols.gmt")
}else{
gmt_file=paste0(data_dir,"/c5.bp.v7.1.symbols.gmt")
}
geneset <- getGmt(gmt_file)
es <- gsva(as.matrix(new_rt), geneset,
min.sz=10, max.sz=500, verbose=TRUE)
gs<-paste0("(",as.character(group1),"|",as.character(group2),").*")
level<-c(as.character(group1),as.character(group2))
design <- model.matrix(~factor(gsub(gs, "\\1", colnames(new_rt)),levels = level))
compare<-paste0(as.character(group1),"vs",as.character(group2))
colnames(design)<-c("ALL", compare)
row.names(design)<-colnames(new_rt)
fit <- lmFit(es, design)
fit <- eBayes(fit)
allGeneSets <- topTable(fit, coef=compare, number=Inf)
adjPvalueCutoff <- 0.05
DEgeneSets <- topTable(fit, coef=compare, number=Inf,
p.value=adjPvalueCutoff, adjust="BH")
write.csv(es,paste0(prefix,"_",compare,"_all_gsva_matrix_",type,".csv"))
write.csv(allGeneSets,paste0(prefix,"_",compare,"_all_deg_gsva_",type,".csv"))
write.csv(DEgeneSets,paste0(prefix,"_",compare,"_padj_",as.character(adjPvalueCutoff),"_deg_gsva_",type,".csv"))
}
gsva_de(data_dir=data_dir,new_rt = new_rt ,prefix = prefix ,group1 = group1,group2 = group2,type=type)
}
# data_dir= "~/Desktop/GSVA/ori_data" #your data folder
# data_filename= "nonCD45_T_count_test.csv" #your data file full name
# prefix= "nonCD45_T" #your out put flile prefix
# group1= "FB6" #your group1
# group2= "FT2" #your group2
gsva_all_in_one(data_dir=data_dir,data_filename =data_filename,prefix = prefix,group1 = group1,group2=group2,type=type )
shell脚本如下
#! /bin/bash
#SBATCH --time=10:00:00
#SBATCH --cpus-per-task=10
#SBATCH --mem=16g
ml R/3.5
data_dir=
Rscript ./GSVA_mouse.R -d ${data_dir} -f nonCD45_T_count_test.csv -p nonCD45_T -g1 FB6 -g2 FT2 -t KEGG
另外一个文章
D11 2 理解GSVA 单个基因集的GSVA
rm(list = ls())
load(file = 'step2_select.Rdata')
# 每次都要检测数据
dat<- as.data.frame(t(M_expr))
dat[1:4,1:4]
library(hgu133plus2.db)
ls("package:hgu133plus2.db")
ids=toTable(hgu133plus2SYMBOL)
head(ids)
ids$symbol
dat<- dat[rownames(dat)%in%ids$symbol,]
table(rownames(dat) %in% ids$symbol)
ids<- ids[ids$symbol%in%rownames(dat),]
ids<- ids[!duplicated(ids$symbol),]
ids$median=apply(dat,1,median)
#
# ids=ids[order(ids$symbol,ids$median,decreasing = T),]
# ids=ids[!duplicated(ids$symbol),]
# dat=dat[ids$symbol,]
# rownames(dat)=ids$symbol
# dat[1:4,1:4]
Kmeans_results$group_list[Kmeans_results$`km.res$cluster`==1]<- c('cluster1')
Kmeans_results$group_list[Kmeans_results$`km.res$cluster`==2]<- c('cluster2')
group_list<- Kmeans_results$group_list
X=dat
##############################################################################################
X<- as.matrix(X)
library(GSVA)
options(stringsAsFactors = F)
gene_Set <- read.delim("C:/Users/chenyuqiao/Desktop/LN RNAseq/Step GSVA/395gene GeneSet.txt", header=FALSE, row.names=1)
gene_Set<- as.data.frame(t(gene_Set))
gene_list<- as.list(gene_Set)
es.max <- gsva(X, gene_list, mx.diff=FALSE, verbose=FALSE, parallel.sz=1)
adjPvalueCutoff <- 0.05 #################这里要调整
logFCcutoff <- log2(2)
table(group_list)
dim(es.max)
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(es.max)
design
library(limma)
contrast.matrix<-makeContrasts(paste0(unique(group_list),collapse = "-"),
levels = design)
# contrast.matrix<-makeContrasts("TNBC-noTNBC",
# levels = design)
contrast.matrix ##这个矩阵声明,我们要把progres.组跟stable进行差异分析比较
##step1
fit <- lmFit(es.max,design)
##step2
fit2 <- contrasts.fit(fit, contrast.matrix)
##这一步很重要,大家可以自行看看效果
fit2 <- eBayes(fit2) ## default no trend !!!
##eBayes() with trend=TRUE
##step3
res <- decideTests(fit2, p.value=adjPvalueCutoff)
summary(res)
tempOutput = topTable(fit2, coef=1, n=Inf)
nrDEG = na.omit(tempOutput)
#write.csv(nrDEG2,"limma_notrend.results.csv",quote = F)
head(nrDEG)
data_gene_set<- as.data.frame(es.max)
nrDEG=nrDEG[nrDEG$P.Value<0.5 & abs(nrDEG$logFC) > 0.1,] #######在这里调阈值,保证输出
print(dim(nrDEG))
n=rownames(nrDEG)
match(n,rownames(nrDEG))
data_gene_set=data_gene_set[match(n,rownames(data_gene_set)),] ######n=rownames(nrDEG),筛选表达矩阵数据
ac=data.frame(g=group_list) ########对应后面的图中的annotation_col
rownames(ac)=colnames(data_gene_set)
# rownames(nrDEG)=substring(rownames(nrDEG),1,50)
library(pheatmap)
pheatmap(data_gene_set)
dev.off
pheatmap(data_gene_set)
pheatmap::pheatmap(data_gene_set,
fontsize_row = 8,height = 11,
annotation_col = ac,show_colnames = F,filename = '11111111111.pdf')
# pheatmap::pheatmap(nrDEG,
# fontsize_row = 8,height = 11,
# annotation_col = ac,show_colnames = F,
# filename = '11111111111.pdf')
#
转载来自:
作者:落寞的橙子
链接:https://www.jianshu.com/p/975adf808130
作者:陈宇乔
链接:https://www.jianshu.com/p/6af88970fcdf
网友评论