今天抽个空把单细胞领域的GO KEGG GSEA GSVA都做了一遍,跑完以后的感觉就是有这样的理解,就是单细胞我首先会选择做GSVA,但是我还会顺便做下kegg或者GSEA,因为我想看一下差异通路到底有哪些基因富集,从头到脚跑一遍后,可能对通路的富集分析有了更深的理解。
数据
load('singleBB.Rdata')
view(experiment.aggregate@meta.data)
experiment.aggregate@meta.data$celltype4 = "NA"
# 更改 celltype 信息,设置细胞群新名称
experiment.aggregate@meta.data[which(experiment.aggregate@meta.data$celltype3 %in% c('GC B cells','Memory B cells','Naive B cells')), "celltype4"] = "CD20+ B cells"
experiment.aggregate@meta.data[which(experiment.aggregate@meta.data$celltype3 %in% c('IGA+ Plasma','IGG+ Plasma')), "celltype4"] = "Plasma B cells"
cells1 <- subset(experiment.aggregate@meta.data, celltype4 %in% c("CD20+ B cells")) %>% rownames()
cells2 <- subset(experiment.aggregate@meta.data, celltype4 %in% c("Plasma B cells")) %>% rownames()
deg <- FindMarkers(experiment.aggregate, ident.1 = cells1, ident.2 = cells2)
deg <- data.frame(gene = rownames(deg), deg)
deg1 <- deg
logFC_t=0.5
P.Value_t = 0.05
k1 = (deg1$p_val_adj < P.Value_t)&(deg1$avg_logFC < -logFC_t)
k2 = (deg1$p_val_adj < P.Value_t)&(deg1$avg_logFC > logFC_t)
table(k1)
table(k2)
change = ifelse(k1,"down",ifelse(k2,"up","stable"))
deg1$change <- change
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
library(dplyr)
s2e <- bitr(deg1$gene,
fromType = "SYMBOL",
toType = "ENTREZID",
OrgDb = org.Hs.eg.db)#人类 转换成ENTREZID
#其他物种http://bioconductor.org/packages/release/BiocViews.html#___OrgDb
deg1 <- inner_join(deg1,s2e,by=c("gene"="SYMBOL"))
#source("kegg_plot_function.R")
#source表示运行整个kegg_plot_function.R脚本,里面是一个function
#以up_kegg和down_kegg为输入数据做图
#1.GO database analysis ----
#(1)输入数据
gene_up = deg1[deg1$change == 'up','ENTREZID']
gene_down = deg1[deg1$change == 'down','ENTREZID']
gene_diff = c(gene_up,gene_down)
gene_all = deg1[,'ENTREZID']
GO分析
if(T){
#细胞组分
ego_CC <- enrichGO(gene = gene_up,
OrgDb= org.Hs.eg.db,
ont = "CC",
pAdjustMethod = "BH",
minGSSize = 1,
pvalueCutoff = 0.01,
qvalueCutoff = 0.01,
readable = TRUE)
#生物过程
ego_BP <- enrichGO(gene = gene_up,
OrgDb= org.Hs.eg.db,
ont = "BP",
pAdjustMethod = "BH",
minGSSize = 1,
pvalueCutoff = 0.01,
qvalueCutoff = 0.01,
readable = TRUE)
#分子功能:
ego_MF <- enrichGO(gene = gene_up,
OrgDb= org.Hs.eg.db,
ont = "MF",
pAdjustMethod = "BH",
minGSSize = 1,
pvalueCutoff = 0.01,
qvalueCutoff = 0.01,
readable = TRUE)
save(ego_CC,ego_BP,ego_MF,file = "GO.Rdata")
}
if(T){
#细胞组分
ego_CC <- enrichGO(gene = gene_down,
OrgDb= org.Hs.eg.db,
ont = "CC",
pAdjustMethod = "BH",
minGSSize = 1,
pvalueCutoff = 0.01,
qvalueCutoff = 0.01,
readable = TRUE)
#生物过程
ego_BP <- enrichGO(gene = gene_down,
OrgDb= org.Hs.eg.db,
ont = "BP",
pAdjustMethod = "BH",
minGSSize = 1,
pvalueCutoff = 0.01,
qvalueCutoff = 0.01,
readable = TRUE)
#分子功能:
ego_MF <- enrichGO(gene = gene_down,
OrgDb= org.Hs.eg.db,
ont = "MF",
pAdjustMethod = "BH",
minGSSize = 1,
pvalueCutoff = 0.01,
qvalueCutoff = 0.01,
readable = TRUE)
save(ego_CC,ego_BP,ego_MF,file = "GODOWN.Rdata")
}
load('GODOWN.Rdata')
ggg <- ego_MF@result
#(3)可视化
#条带图
dotplot(ego_MF,showCategory=13)
barplot(ego_BP)
if(F){
go <- enrichGO(gene = gene_up, OrgDb = "org.Hs.eg.db", ont="all")
}
##save(go,file="go.Rdata")
### 直接加载我保存的数据
library(ggplot2)
p <- dotplot(go, split="ONTOLOGY") +facet_grid(ONTOLOGY~., scale="free")
p
图片.png
KEGG
if(T){
kk.up <- enrichKEGG(gene = gene_up,
organism = 'hsa',
universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff = 0.9)
kk.down <- enrichKEGG(gene = gene_down,
organism = 'hsa',
universe = gene_all,
pvalueCutoff = 0.9,
qvalueCutoff =0.9)
kk.diff <- enrichKEGG(gene = gene_diff,
organism = 'hsa',
pvalueCutoff = 0.9)
save(kk.diff,kk.down,kk.up,file = "GSE4107kegg.Rdata")
}
load("GSE4107kegg.Rdata")
#(3)从富集结果中提取出结果数据框
ekegg <- setReadable(kk.up, OrgDb = org.Hs.eg.db, keyType="ENTREZID")
kegg_diff_dt <- data.frame(ekegg)
p1 <- barplot(ekegg, showCategory=10)
p2 <- dotplot(ekegg, showCategory=10)
plotc = p1/p2
down_kegg <- kk.down@result %>%
filter(pvalue<0.05) %>% #筛选行
mutate(group=-1) #新增列
up_kegg <- kk.up@result %>%
filter(pvalue<0.01) %>%
mutate(group=1)
#(5)可视化
g_kegg <- kegg_plot(up_kegg,down_kegg)
g_kegg
g_kegg +scale_y_continuous(labels = c(15,10,5,0,5,10,15,20,25,30))
图片.png
GSEA
library(GSEABase)
library(clusterProfiler)
library(DOSE)
library(org.Hs.eg.db)
library(ggplot2)
library(stringr)
options(stringsAsFactors = F)
data(geneList)
geneList = deg1$avg_logFC
names(geneList) = deg1$ENTREZID
geneList = sort(geneList,decreasing = T)
#准备gmt文件
geneset <- read.gmt("c2.cp.kegg.v7.2.entrez.gmt")
geneset$ont = str_remove(geneset$ont,"HALLMARK_")
egmt <- GSEA(geneList, TERM2GENE=geneset,verbose=F)
#> Warning in fgsea(pathways = geneSets, stats = geneList, nperm = nPerm, minSize = minGSSize, : There are ties in the preranked stats (0.05% of the list).
#> The order of those tied genes will be arbitrary, which may produce unexpected results.
egmt2<- setReadable(egmt,OrgDb=org.Hs.eg.db, keyType = "ENTREZID")
y=data.frame(egmt2)
#气泡图,展示geneset被激活还是抑制
dotplot(egmt2,split=".sign")+facet_grid(~.sign)
#dotplot(egmt2,split=".sign")+facet_grid(~.sign)+
#scale_y_discrete(labels=function(x) str_wrap(str_replace_all(x,"_"," "), width=30))
# scale_y_discrete(labels=function(x) str_remove(x,"HALLMARK_"))
#dotplot(y,showCategory=20,split=".sign")+facet_grid(~.sign)
library(enrichplot)
gseaplot2(egmt, geneSetID = 1, title = egmt$Description[1])
#多个gnenset合并展示
gseaplot2(egmt, geneSetID = 1:3,pvalue_table = T)
#gseaplot2(egmt,2,color = "blue",pvalue_table = T)
gseaplot2(egmt2, geneSetID = 'KEGG_PROTEIN_EXPORT',pvalue_table = T)
for(i in seq_along(egmt@result$ID)){
p <- gseaplot(egmt, geneSetID = i, title = egmt@result$ID[i], by = "runningScore")
filename <- paste0('i love you', egmt@result$ID[i], '.png')
ggsave(filename = filename, p, width = 8, height = 4)
}
# 美化版
if(F){
library(enrichplot)
for(i in seq_along(egmt@result$ID)){
p <- gseaplot2(egmt, geneSetID = i, title = egmt@result$ID[i])
filename <- paste0('i love you', egmt@result$ID[i], '.png')
ggsave(filename = filename, p, width = 8, height = 5)
}
}
图片.png
GSVA
我用的数据集是board公司http://www.gsea-msigdb.org/gsea/msigdb的kegg和reactome,主要是reactome有代谢通路,GSVA可以设置多核parallel.sz=12
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
library(dplyr)
setwd("~/project/")
load('singleBB.Rdata')
expr <- as.data.frame(experiment.aggregate@assays$RNA@data)
expr$ID <- rownames(expr)
s2e <- bitr(expr$ID,
fromType = "SYMBOL",
toType = "ENTREZID",
OrgDb = org.Hs.eg.db)#人类 转换成ENTREZID
#其他物种http://bioconductor.org/packages/release/BiocViews.html#___OrgDb
expr <- inner_join(expr,s2e,by=c("ID"="SYMBOL"))
rownames(expr) <- expr$ENTREZID
expr <- expr[,-9150]
expr <- expr[,-9149]
meta <- as.data.frame(experiment.aggregate@meta.data[,c('orig.ident',"celltype3")])
kegggmt1 <- read.gmt("c2.cp.kegg.v7.2.entrez.gmt")
kegggmt2 <- read.gmt("c2.cp.reactome.v7.2.entrez.gmt")
kegggmt <- rbind(kegggmt1,kegggmt2)
colnames(kegggmt1)
kegg_list = split(kegggmt$gene, kegggmt$term)
library(GSVA)
expr=as.matrix(expr)
kegg2 <- gsva(expr, kegg_list, kcdf="Gaussian",method = "gsva",parallel.sz=12)
write.table(data,"kegg2.txt",sep="\t",quote = F)
data <- data.table::fread('kegg2.txt',data.table = F)
rownames(data) <- data$V1
data <- data[,-1]
library(dplyr)
meta <- meta %>%
arrange(meta$celltype3)
data <- data[,rownames(meta)]
identical(colnames(data),rownames(meta))
data$GC <- apply(data[,1:111], 1, mean)
data$IGA <- apply(data[,112:4235], 1, mean)
data$IGG <- apply(data[,4236:5504], 1, mean)
data$memory <- apply(data[,5505:7797], 1, mean)
data$naive <- apply(data[,7798:9148], 1, mean)
table(meta$celltype3)
test <- data[,9149:9153]
pathway <- c('KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION','REACTOME_COSTIMULATION_BY_THE_CD28_FAMILY','KEGG_PROTEIN_EXPORT','KEGG_DNA_REPLICATION','KEGG_BASE_EXCISION_REPAIR','KEGG_CELL_CYCLE',
'REACTOME_GLUCOSE_METABOLISM','REACTOME_FATTY_ACID_METABOLISM','REACTOME_METABOLISM_OF_FAT_SOLUBLE_VITAMINS','REACTOME_METABOLISM_OF_AMINO_ACIDS_AND_DERIVATIVES','REACTOME_SELENOAMINO_ACID_METABOLIS',
'KEGG_JAK_STAT_SIGNALING_PATHWAY','KEGG_TGF_BETA_SIGNALING_PATHWAY','KEGG_WNT_SIGNALING_PATHWAY','KEGG_MTOR_SIGNALING_PATHWAY')
test1 <- test[pathway,c(5,4,1,2,3)]
result3<- t(scale(t(test1)))
result3[result3>2]=2
result3[result3<-2]=-2
library(pheatmap)
G1 <- pheatmap(result3,
cluster_rows = F,
cluster_cols = F,
show_rownames = T,
show_colnames = T,
color =colorRampPalette(c("blue", "white","red"))(100),
cellwidth = 10, cellheight = 15,
fontsize = 10)
pdf(("G1.pdf"),width = 7,height = 7)
print(G1)
dev.off()
#test4 <- test3[grep(pattern="METABOLISM",rownames(test3)),]
图片.png
网友评论