- 参数设置
library(getopt)
command= matrix(c(
'help' ,"h", 0, "logical","不接参数:帮助文档;",
'input' ,"i", 1, "character","参数必选:包含10X的三个文件稀疏矩阵或者h5,多个样本逗号分隔",
'tax' ,"t", 1, "character","参数必选:hsa or mmu,default:hsa",
'from' ,'f', 2, "character","参数可选:输入的基因格式是SYMBOL or ENSEMBL. default:SYMBOL",
'outdir' ,"o", 2, "character","参数可选:output directory,default:'.'"
), byrow=TRUE, ncol=5)
args=getopt(command)
if(!is.null(args$help) | is.null(args$input)){
cat(paste(getopt(command, usage = T), "\n"))
q()
}
- 参数
form<-"SYMBOL"
if(!is.null(args$form)){
form<-args$form
}
tax<-"hsa"
if(!is.null(args$tax)){
tax<-args$tax
}
outdir<-"."
if(!is.null(args$outdir)){
outdir<-args$outdir
if(!dir.exists(outdir)){
dir.create(outdir,recursive=T)
}
}
datadir <- args$input
- 脚本主体
## 根据物种选择数据库
taxdb<-NULL
if(tax=='hsa'){
library(org.Hs.eg.db)
taxdb<-"org.Hs.eg.db"
}else if(tax=='mmu'){
library(org.Mm.eg.db)
taxdb<-"org.Mm.eg.db"
}
## 读取差异基因,用于GO,KEGG 分析
## diff_signifcant.xls
data<-read.table(datadir,header=T,sep="\t",check.names=F)
## deg list
gene<-as.character(unique(data[,1]))
## convert gene symbol to ENTREZID
## convert gene symbol 等 to ENTREZID
gene.id<-bitr(gene,fromType=form,toType=c("ENTREZID"),OrgDb=taxdb,drop = T)
## GO 分析 输入最好是用ENTREZ ID,值比较固定,不建议使用GeneSymbol,容易匹配出问题。
gene<-gene.id[,"ENTREZID"]
## get all data info
all_data<-merge(gene.id, data, by.x = form, by.y = colnames(data)[1],all.x=T) ##by.y="RNA" or "gene_name"
###GO enrichment
if(file.exists(file.path(outdir,"ego.rdata"))){
load(file.path(outdir,"ego.rdata"))
}else{
ego<-enrichGO(gene=gene,
OrgDb=taxdb,
minGSSize = 2,
keyType = "ENTREZID",
ont ="ALL",
pvalueCutoff = 0.22,
readable=TRUE)
save(ego,file=file.path(outdir,"ego.rdata"))
}
write.table(ego@result,file.path(outdir,"GO_enrichment.xls"),row.names=F,quote=F,sep="\t")
####KEGG enrichment
if (file.exists(file.path(outdir,"kegg.rdata"))){
load(file.path(outdir,"kegg.rdata"))
}else{
## 输入Gene ID的格式和类型与enrichGO一致
kegg <- enrichKEGG(gene=gene,
organism=tax,
keyType="ncbi-geneid",
pvalueCutoff = 0.05,
pAdjustMethod = "BH",
minGSSize = 2,
maxGSSize = 500,
qvalueCutoff = 0.1,
use_internal_data = FALSE)
save(kegg,file=file.path(outdir,"kegg.rdata"))
}
kegg<-setReadable(kegg,
OrgDb=taxdb,
keyType="ENTREZID") ###mapping gene id to gene symbol
kegg_pathview<-kegg@result
colnames(kegg_pathview)[1]<-"pathwayID"
write.table(kegg_pathview,file.path(outdir,"KEGG_enrichment.xls"),quote=F,row.names=F,sep="\t")
GO 画图
##### ------------------- plot GO -----------------------
## 按照 ONTOLOGY 对 ego@result 进行分组,并取top10
top10_list <- lapply(split(ego@result, ego@result$ONTOLOGY), function(x) head(x,10))
## do.call(func, list) 对list中的每一个元素都执行func,适用于大规模数据。
top10 <- do.call("rbind",top10_list)
top10 <- top10[top10$p.adjust<=0.05,]
top10$Description <- factor(top10$Description,levels=unique(top10$Description))
char_size <- max(nchar(as.character(top10$Description)))
if(char_size<=55){
width = 14
}else if(char_size<=80){
width = 18
}else{
width = 20
}
pdf(file.path(outdir,"GO_barplot.pdf"),width=width,height=10)
ggplot(top10,aes(x=Count,y=Description,fill=ONTOLOGY))+
geom_bar(stat="identity")+
theme_bw()+
theme(panel.grid=element_blank(),
axis.text.y=element_text(color="black",size=rel(1.5)),
axis.text.x=element_text(color="black",size=rel(1.5)),
axis.title.x=element_text(color="black",size=rel(1.5)),
legend.text=element_text(size=rel(1.4)),
legend.title=element_text(size=rel(1.3)))+
scale_fill_manual(values=c("#ca4175","#6888f4","#7bc532"))
dev.off()
#------ dotplot --------
ratio <- lapply(str_split(top10$GeneRatio,"/"),as.numeric)
top10$ratio<-sapply(ratio,function(x) x[[1]]/x[[2]] )
pdf(file.path(outdir,"GO_dotplot.pdf"),width=width,height=10)
ggplot(top10,
aes(x=ratio,y=Description,size=Count,colour=p.adjust))+
geom_point()+
facet_grid(ONTOLOGY~.,scales="free", space = "free")+
theme_bw()+
theme(panel.grid=element_blank(),
axis.text.y=element_text(size=rel(1.5)),
axis.text.x=element_text(size=rel(1.4)),
axis.title.x=element_text(size=rel(1.5)),
strip.text.y = element_text(size=rel(1.6),
angle=-90,
face="bold"),
plot.margin=unit(c(1,0.8,1,0.8),"cm"))+
xlab("GeneRatio")
dev.off()
KEGG作图
# -------------- draw KEGG barplot -----------------------------------------------
kegg_pathview<-kegg_pathview[kegg_pathview$p.adjust<=0.05,]
if(dim(kegg_pathview)[1]>20){
kegg_pathview<-head(kegg_pathview,20)
}
kegg_pathview$Description<-factor(kegg_pathview$Description,levels=unique(kegg_pathview$Description))
pdf(file.path(outdir,"KEGG_barplot.pdf"),width=12,height=8)
ggplot(kegg_pathview,aes(x=Count,y=Description,fill=p.adjust))+
geom_bar(stat="identity")+
theme_bw()+
theme(panel.grid=element_blank(),
axis.text=element_text(color="black",size=rel(1.4)),
axis.title=element_text(color="black",size=rel(1.4)))
dev.off()
## draw dotplot
ratio <- lapply(str_split(kegg_pathview$GeneRatio,"/"),as.numeric)
kegg_pathview$ratio<-sapply(ratio,function(x) x[[1]]/x[[2]] )
pdf(file.path(outdir,"KEGG_dotplot.pdf"),width=12,height=8)
ggplot(kegg_pathview,aes(x=ratio,y=Description,size=Count,color=p.adjust))+
geom_point()+
theme_bw()+
theme(panel.grid=element_blank(),
axis.text.y=element_text(color="black",size=rel(1.5)),
axis.text.x=element_text(size=rel(1.4)),
axis.title.x=element_text(size=rel(1.5)))
+xlab("GeneRatio")
dev.off()
#### map gene to KEGG pathway
dir.create(file.path(outdir,"KEGG"))
for (i in which(kegg_pathview$pvalue<=0.05)){
# get gene ID
genelist <- unlist(strsplit(kegg_pathview$geneID[i],"/",fixed=T))
gene_len <- length(genelist)
subdata <- subset(all_data[,c("ENTREZID","log2FoldChange")],
subset=ENTREZID %in% genelist)
rownames(subdata)<-subdata[,1]
subdata[,1]<-NULL
fname <- file.path(outdir,paste(kegg_pathview[i,1],gene_len,"png",sep="."))
if(!file.exists(fname)){
pw_out <- pathview(gene.data = subdata,
pathway.id = kegg_pathview[i,1],
species=tax,
limit=list(gene =ifelse(max(abs(subdata[,1]))>3,
3,
ceiling(max(abs(subdata[,1])))),cpd=1),
out.suffix=gene_len,kegg.native = T,kegg.dir=file.path(outdir,"KEGG"))
}
}
脚本函数/方法
- bitr(geneID, fromType, toType,OrgDb,drop=F)
geneID: 输入的geneID
fromType:geneID的输入格式
toType: geneID转换后的格式
drop:是否去除空值
clusterProfiler 包 的bitr() 可以支持多种geneID之间的转换
在生信中,我们常用的geneID主要是gene symbol 和 ensembleID,
image.png
网友评论