- 参考链接
https://www.jianshu.com/p/c01b4cc1b98a
https://www.jianshu.com/p/69aa1c9cf4d1
http://blog.sina.com.cn/s/blog_5d188bc40102xkr1.html
Y叔官方教程
GO、KEGG生物学意义讲解 - 我的实现代码
背景说明:此处是在做完差异分析之后的GO和KEGG,GSEA
前面的步骤
全程在R-studio完成。
各种包各种依赖,哪个缺少,安哪个。
R包的安装方法:
install.packages("ggplot2")
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("topGO", version = "3.8")
source("https://bioconductor.org/biocLite.R")
biocLite("clusterProfiler")
各种报错大部分都是包的依赖问题。
前面的步骤代码如下:
source("https://bioconductor.org/biocLite.R")
biocLite("DESeq2")
install.packages("tidyverse")
#输入数据
library(tidyverse)
library(DESeq2)
library(ggplot2)
#import data
#setwd("/home/chaim/disk/lyb/data/")
#setwd("/mnt/d/RNA-seq/")
setwd("D:/RNA-seq/")
countData <- as.matrix(read.csv("gene_count_matrix.csv",row.names="gene_id"))
condition <- factor(c(rep("NS",3),rep("WT",3)),levels = c("NS","WT"))
colData <- data.frame(row.names=colnames(countData),condition)
dds <- DESeqDataSetFromMatrix(countData = countData,colData = colData, design = ~ condition)
dds <- DESeq(dds)
#总体结果查看
res = results(dds)
res = res[order(res$pvalue),]
summary(res)
#write.csv(res,file="All_results.csv")
table(res$padj<0.05)
#提取差异基因(DEGs)并进行gene Symbol注释
diff_gene_deseq2 <- subset(res,padj<0.05 & abs(log2FoldChange)>1)
dim(diff_gene_deseq2)
write.csv(diff_gene_deseq2,file = "DEG_treat_vs_control.csv")
开始GO、KEGG、GSEA分析
#生成对应的散点火山图
resdata <- read.csv("DEG_treat_vs_control.csv",header = TRUE)
threshold <- as.factor(ifelse(resdata$padj < 0.01 & abs(resdata$log2FoldChange) >= 2 ,ifelse(resdata$log2FoldChange >= 2 ,'Up','Down'),'Not'))
deg_img <- ggplot(resdata,aes(x=resdata$log2FoldChange,y=-log10(resdata$padj),colour=threshold)) + xlab("log2(Fold Change)")+ylab("-log10(qvalue)") + geom_point(size = 0.5,alpha=1) + ylim(0,200) + xlim(-12,12) + scale_color_manual(values=c("green","grey", "red"))
ggsave("deg.pdf",deg_img)
#聚类热图
##此处生成的是所有的结果的热图
install.packages("pheatmap")
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("genefilter", version = "3.8")
#out.csv 是DEG_treat_vs_control.csv中过滤掉未被注释的基因,out.csv是已经注释的基因
#resgene <- read.csv("out.csv",header = TRUE)
library(readr)
library("genefilter")
library(pheatmap)
rld <- rlogTransformation(dds,blind=F)
write.csv(assay(rld),file="mm.DeSeq2.pseudo.counts.csv")
topVarGene <- head(order(rowVars(assay(rld)),decreasing = TRUE),30)
mat <- assay(rld)[topVarGene,]
##mat <- mat - rowMeans(mat) 减去一个平均值,让数值更加集中
anno <- as.data.frame(colData(rld)[,c("condition","sizeFactor")])
pheat <- pheatmap(mat,annotation_col=anno)
ggsave("pheatmap.png",pheat,width=12,height=12)
# #安装biomaRt包
# source("http://bioconductor.org/biocLite.R")
# biocLite("biomaRt")
# install.packages('DT')
# #用bioMart对差异表达基因进行注释
# library("biomaRt")
# listMarts()
#
# ensembl=useMart("ENSEMBL_MART_ENSEMBL")
# all_datasets <- listDatasets(ensembl)
# library(DT)
# datatable(all_datasets,options = list(searching=FALSE,pageLength=5,lengthMenu=c(5,10,15,20)))
#安装clusterProfiler 用于GO/KEGG分析及GSEA
source("https://bioconductor.org/biocLite.R")
biocLite("clusterProfiler")
biocLite("DOSE")
library(DO.db)
require(DOSE)
library(clusterProfiler)
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("S4Vectors", version = "3.8")
#安装annotationhub
if(!requireNamespace("BiocManager",quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("AnnotationHub", version = "3.8")
library(AnnotationHub)
require(AnnotationHub)
#下面的hub <- AnnotationHub() maize <- hub[['AH66226']]可能会很慢,直接终止程序。根据报错信息,找到下载地址,下载对应的文件。放到本地对应位置。注意下载的splite文件应该重命名为对应报错提示的编号
hub <- AnnotationHub()
query(hub,"zea mays")
maize <- hub[['AH66226']]
length(keys(maize))
#显示maize支持的所有的数据
columns(maize)
require(clusterProfiler)
library(clusterProfiler)
bitr(keys(maize)[1],'GID',c("GO","ENTREZID","UNIGENE"),maize)
#GO富集分析
#使用enrichGO
geneid <- read.csv('geneid_GO_KEGG.csv')
#geneid_GO_KEGG.csv是将显著的gene通过gramene转换编号。第二列是GO编号。
target_gene_id <- geneid[,2]
#target_gene_id <- unique(read.delim("geneid2GO",header = TRUE)$GO_term_accession)
#此处的ont是`MF`,可以自己更改为`BP`或`CC`,具体含义见文末讲解。
res_GO=enrichGO(target_gene_id,OrgDb=maize,keyType = 'GO',ont="MF",pvalueCutoff=0.01,qvalueCutoff=0.05)
write.table(as.data.frame(res_GO@result),file="GO_result.txt")
write.csv(as.data.frame(res_GO),"GO_result.csv",row.names = F)
head(summary(res_GO))
#柱形图
barplot(res_GO,showCategory = 30,title = "EnrichmentGO")
#气泡图
dotplot(res_GO,font.size=5,showCategory=30)
#浓缩图
emapplot(res_GO)
#网络图
enrichMap(res_GO,vertex.label.cex=1.2,layout=igraph::layout.kamada.kawai)
res_GO <- setReadable(res_GO,OrgDb = maize)
plotGOgraph(res_GO)
cnetplot(res_GO,categorySize="pvalue",foldChange=target_gene_id)
goplot(res_GO)
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("Rgraphviz", version = "3.8")
library(Rgraphviz)
#转换GO id 到 gid,其实gid就是ncbi-geneid
GO2gid <- bitr(target_gene_id,fromType = 'GO',toType = 'GID',OrgDb = 'maize')
gid2kegg <- bitr_kegg(GO2gid[,2],fromType = 'ncbi-geneid',toType = 'ncbi-proteinid',organism = 'zma')
head(gid2kegg,100)
#kegg
kk <- enrichKEGG(gene = gid2kegg[,2],organism ="zma",keyType = 'ncbi-proteinid',pvalueCutoff = 0.01,qvalueCutoff = 0.01)
write.table(as.data.frame(kk@result),file="kegg_result.txt")
write.csv(as.data.frame(kk@result),file = "kegg_result.csv")
#kegg结果的可视化
dotplot(kk,showCategory=30)
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("topGO", version = "3.8")
#GSEA
gsecc <- gseGO(geneList = target_gene_id,ont="CC",OrgDb = maize,keyType = 'GO',verbose = F)
#gseGO进行GSEA分析
goplot_GO.png
deg.png
barplot_kegg.png
cneplot_GO.png
pheatmap.png
这个pheatmap的数据源不正确,所以基因名称都是MSTRG。实际应该使用的是过滤后显著差异的基因。
但是在构建S4格式数据时,仍未成功。后续实现此功能。
- 关于GO,KEGG的总结
GO有3种格式,
- MF 分子功能molecular function 本文我的是用的MF做的enrichGO,可以试试其他两个选项
- BP 生物学途径biological process
- CC 细胞组份 cellular component
分子功能描述在分子生物学上的活性,大部分指的是单个基因产物的功能,还有一小部分是此基因产物形成的复合物的功能,包括antioxidantactivity 、binding等;
生物学途径是由分子功能有序地组成的,具有多个步骤的一个过程,如cellular process、cell killing等;
细胞组份指基因产物位于何种细胞器或基因产物组中,如membrane、organelle等。
在使用GSEA的数据库格式分类GSEA分类
网友评论