1.构建水稻OrgDb包
1.1安装如下依赖:
R 包:argparser, tidyverse, formattable, AnnotationForge, seqinr, clusterProfiler
1.2.下载蛋白序列
链接如下:http://rice.uga.edu/pub/data/Eukaryotic_Projects/o_sativa/annotation_dbs/pseudomolecules/version_7.0/all.dir/all.pep
处理文件,去除注释,只保留ID和序列。
awk '{print $1}' all.pep > protein.fa
压缩文件:gzip protein.fa
1.3.注释
登录 http://eggnog-mapper.embl.de/ 上传蛋白序列, 在线进行蛋白注释.
完成后下载 out.emapper.annotations
1.4.安装 emcp
git clone http://git.genek.cn:3333/zhxd2/emcp.git
2.构建OrgDb包
运行 emapperx.R
Rscript ./emcp/emapperx.R out.emapper.annotations proteins.fa
注意: emapperx.R这个封装的R包在构建的时候已经把R包装到当前文件夹
命令如下:
install.packages('org.My.eg.db_1.0.tar.gz',
repos = NULL, #从本地安装
lib = 'R_Library') # 安装在当前路径的R_Library文件夹下
3.加载OrgDb并进行GO分析
library(enrichplot)
library(AnnotationDbi)
library(clusterProfiler)
读入基因list文件
gene=read_csv(file = "GO/gene.txt",col_names = "gene_id")

gene=pull(gene) #变成列表
head(gene)

可忽略
#安装OrgDb包
install.packages('org.My.eg.db_1.0.tar.gz',
repos = NULL, #从本地安装
lib = 'R_Library') # 安装在当前路径的R_Library文件夹下
加载OrgDB包
library(org.My.eg.db, lib.loc = "GO/R_Library/")
k=head(keys(org.My.eg.db))
columns(org.My.eg.db)

AnnotationDbi::select(org.My.eg.db,keys=k,columns=c("GID"))

富集分析
ego <- enrichGO(gene = gene,
OrgDb = org.My.eg.db,
keyType = "GID",
ont = "BP",
pvalueCutoff = 1,
qvalueCutoff = 1,
readable = FALSE)
ego_df <- as.data.frame(ego)
view(ego_df)

Gene_set enrichment
导入数据
Deg <- read.delim("~/GO/Deg1.txt",)
head(Deg)

geneList<-Deg$log2FoldChange
names(geneList)<-Deg$Row.names
geneList<-sort(geneList,decreasing = T)
head(geneList)

GSEA
ego2 <- gseGO(geneList = geneList,
OrgDb = org.My.eg.db,
keyType = "GID",
ont = "BP",
minGSSize = 30,
maxGSSize = 500,
pvalueCutoff = 1,
verbose = FALSE)
head(ego2)

转化为数据匡以及绘图
ego2_df_gsea=as.data.frame(ego2)
gseaplot2(ego2, geneSetID=c("GO:0009408"))

参考:基因课---《22天入门生物信息》----富集分析
网友评论