deseq2和clusterProfiler使用之小试牛刀

作者: wo_monic | 来源:发表于2019-07-19 20:06 被阅读3次

deseq2和clusterProfiler使用之小试牛刀
clusterProfiler使用之小试牛刀
deseq2 and clusterprofiler
转录组从下机数据到GO、kegg、GSEA
转录组从下机数据到GO、kegg、GSEA
ggplot2绘制GO富集分析柱状图
R 实战 | 使用clusterProfiler进行多组基因富集
linux学习100篇13:安装clusterProfiler
小麦的GO富集分析
基因ID转换方法总结

参考链接
https://www.jianshu.com/p/c01b4cc1b98a
https://www.jianshu.com/p/69aa1c9cf4d1
http://blog.sina.com.cn/s/blog_5d188bc40102xkr1.html
Y叔官方教程
 GO、KEGG生物学意义讲解
 annotationhub的ftp下载地址（用于下载annotationhub数据库，速度慢的离谱）
我的实现代码
背景说明：此处是在做完差异分析之后的ＧO和KEGG，GSEA
前面的步骤

全程在R-studio完成。
各种包各种依赖，哪个缺少，安哪个。
R包的安装方法：

install.packages("ggplot2")

if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
BiocManager::install("topGO", version = "3.8")

source("https://bioconductor.org/biocLite.R")
biocLite("clusterProfiler")

各种报错大部分都是包的依赖问题。
前面的步骤代码如下：

source("https://bioconductor.org/biocLite.R")
biocLite("DESeq2")
install.packages("tidyverse")
#输入数据
library(tidyverse)
library(DESeq2)
library(ggplot2)
#import data
#setwd("/home/chaim/disk/lyb/data/")
#setwd("/mnt/d/RNA-seq/")
setwd("D:/RNA-seq/")
countData <- as.matrix(read.csv("gene_count_matrix.csv",row.names="gene_id"))
condition <- factor(c(rep("NS",3),rep("WT",3)),levels = c("WT","NS"))
colData <- data.frame(row.names=colnames(countData),condition)
dds <- DESeqDataSetFromMatrix(countData = countData,colData = colData, design = ~ condition)
dds <- DESeq(dds)
#总体结果查看
res = results(dds)
res = res[order(res$pvalue),]
summary(res)
#write.csv(res,file="All_results.csv")
table(res$padj<0.05)
#提取差异基因（DEGs）并进行gene Symbol注释
diff_gene_deseq2 <- subset(res,padj<0.05 & abs(log2FoldChange)>1)
dim(diff_gene_deseq2)
write.csv(diff_gene_deseq2,file = "DEG_treat_vs_control.csv")

开始GO、KEGG、GSEA分析

#生成对应的散点火山图
  resdata <- read.csv("DEG_treat_vs_control.csv",header = TRUE)
  threshold <- as.factor(ifelse(resdata$padj < 0.01 & abs(resdata$log2FoldChange) >= 2 ,ifelse(resdata$log2FoldChange >= 2 ,'Up','Down'),'Not'))
  deg_img <- ggplot(resdata,aes(x=resdata$log2FoldChange,y=-log10(resdata$padj),colour=threshold)) + xlab("log2(Fold Change)")+ylab("-log10(qvalue)") + geom_point(size = 0.5,alpha=1) + ylim(0,200) + xlim(-12,12) + scale_color_manual(values=c("green","grey", "red"))
  ggsave("deg.pdf",deg_img)
  #聚类热图
  ##此处生成的是所有的结果的热图
  install.packages("pheatmap")
  if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
  BiocManager::install("genefilter", version = "3.8")
  #out.csv 是DEG_treat_vs_control.csv中过滤掉未被注释的基因，out.csv是已经注释的基因
  #resgene <- read.csv("out.csv",header = TRUE) 
  library(readr)
  library("genefilter")
  library(pheatmap)
  rld <- rlogTransformation(dds,blind=F)
  write.csv(assay(rld),file="mm.DeSeq2.pseudo.counts.csv")
  topVarGene <- head(order(rowVars(assay(rld)),decreasing = TRUE),30)
  mat <- assay(rld)[topVarGene,]
  ##mat <- mat - rowMeans(mat) 减去一个平均值，让数值更加集中
  anno <- as.data.frame(colData(rld)[,c("condition","sizeFactor")])
  pheat <- pheatmap(mat,annotation_col=anno)
  ggsave("pheatmap.png",pheat,width=12,height=12)
  
# #安装biomaRt包
# source("http://bioconductor.org/biocLite.R")
# biocLite("biomaRt")
# install.packages('DT')
# #用bioMart对差异表达基因进行注释
# library("biomaRt")
# listMarts()
# 
# ensembl=useMart("ENSEMBL_MART_ENSEMBL")
# all_datasets <- listDatasets(ensembl)
# library(DT)
# datatable(all_datasets,options = list(searching=FALSE,pageLength=5,lengthMenu=c(5,10,15,20)))

#安装clusterProfiler 用于GO/KEGG分析及GSEA
source("https://bioconductor.org/biocLite.R")
biocLite("clusterProfiler")
biocLite("DOSE")
library(DO.db)
require(DOSE)
library(clusterProfiler)
 
 if (!requireNamespace("BiocManager", quietly = TRUE))
   install.packages("BiocManager")
 BiocManager::install("S4Vectors", version = "3.8")

#安装annotationhub
if(!requireNamespace("BiocManager",quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("AnnotationHub", version = "3.8")
library(AnnotationHub)
require(AnnotationHub)
#下面的hub <- AnnotationHub()  maize <- hub[['AH66226']]可能会很慢，直接终止程序。根据报错信息，找到下载地址，下载对应的文件。放到本地对应位置。https://annotationhub.bioconductor.org/metadata/annotationhub.sqlite3
#注意下载的splite文件应该重命名为对应报错提示的编号
hub <- AnnotationHub()  
query(hub,"zea mays")
maize <- hub[['AH66226']]
length(keys(maize))

#显示maize支持的所有的数据
columns(maize)
require(clusterProfiler)
library(clusterProfiler)
bitr(keys(maize)[1],'GID',c("GO","ENTREZID","UNIGENE"),maize)
#GO富集分析
#使用enrichGO
geneid <- read.csv('geneid_GO_KEGG.csv')
#geneid_GO_KEGG.csv是将显著的gene通过gramene转换编号。第二列是GO编号。

基因编号转换为GO、KEGG编号参考地址

target_gene_id <- geneid[,2]
#target_gene_id <- unique(read.delim("geneid2GO",header = TRUE)$GO_term_accession)
#此处的ont是`MF`，可以自己更改为`BP`或`CC`，具体含义见文末讲解。
res_GO=enrichGO(target_gene_id,OrgDb=maize,keyType = 'GO',ont="MF",pvalueCutoff=0.01,qvalueCutoff=0.05)
write.table(as.data.frame(res_GO@result),file="GO_result.txt")
write.csv(as.data.frame(res_GO),"GO_result.csv",row.names = F)
head(summary(res_GO))
#柱形图
barplot(res_GO,showCategory = 30,title = "EnrichmentGO")
#气泡图
dotplot(res_GO,font.size=5,showCategory=30)
#浓缩图
emapplot(res_GO)

#网络图
enrichMap(res_GO,vertex.label.cex=1.2,layout=igraph::layout.kamada.kawai)
res_GO <- setReadable(res_GO,OrgDb = maize)
plotGOgraph(res_GO)
cnetplot(res_GO,categorySize="pvalue",foldChange=target_gene_id)
goplot(res_GO)

if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
BiocManager::install("Rgraphviz", version = "3.8")
library(Rgraphviz)

#转换GO id 到 gid，其实gid就是ncbi-geneid
GO2gid <- bitr(target_gene_id,fromType = 'GO',toType = 'GID',OrgDb = 'maize')
gid2kegg <- bitr_kegg(GO2gid[,2],fromType = 'ncbi-geneid',toType = 'ncbi-proteinid',organism = 'zma')
head(gid2kegg,100)
#kegg
kk <- enrichKEGG(gene = gid2kegg[,2],organism ="zma",keyType = 'ncbi-proteinid',pvalueCutoff = 0.01,qvalueCutoff = 0.01)
write.table(as.data.frame(kk@result),file="kegg_result.txt")
write.csv(as.data.frame(kk@result),file = "kegg_result.csv")
#kegg结果的可视化
dotplot(kk,showCategory=30)

if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
BiocManager::install("topGO", version = "3.8")

#GSEA
gsecc <- gseGO(geneList = target_gene_id,ont="CC",OrgDb = maize,keyType = 'GO',verbose = F)

#gseGO进行GSEA分析

goplot_GO.png

deg.png

barplot_kegg.png

cneplot_GO.png

pheatmap.png

这个pheatmap的数据源不正确，所以基因名称都是MSTRG。实际应该使用的是过滤后显著差异的基因。
但是在构建S4格式数据时，仍未成功。后续实现此功能。

关于GO,KEGG的总结
GO有3种格式，

MF 分子功能molecular function 本文我的是用的MF做的enrichGO，可以试试其他两个选项
BP 生物学途径biological process
CC 细胞组份 cellular component
分子功能描述在分子生物学上的活性，大部分指的是单个基因产物的功能，还有一小部分是此基因产物形成的复合物的功能，包括antioxidantactivity 、binding等；
生物学途径是由分子功能有序地组成的，具有多个步骤的一个过程，如cellular process、cell killing等；
细胞组份指基因产物位于何种细胞器或基因产物组中，如membrane、organelle等。

在使用GSEA的数据库格式分类GSEA分类