美文网首页注释和富集
Script-代码分析--GO,KEGG

Script-代码分析--GO,KEGG

作者: QXPLUS | 来源:发表于2021-06-08 21:46 被阅读0次
  1. 参数设置
library(getopt)
command= matrix(c(
  'help'          ,"h", 0, "logical","不接参数:帮助文档;",
  'input'         ,"i", 1, "character","参数必选:包含10X的三个文件稀疏矩阵或者h5,多个样本逗号分隔",
  'tax'           ,"t", 1, "character","参数必选:hsa or mmu,default:hsa",
  'from'          ,'f', 2, "character","参数可选:输入的基因格式是SYMBOL or ENSEMBL.  default:SYMBOL",
  'outdir'        ,"o", 2, "character","参数可选:output directory,default:'.'"
  ), byrow=TRUE, ncol=5)
args=getopt(command)

if(!is.null(args$help) | is.null(args$input)){
    cat(paste(getopt(command, usage = T), "\n"))
    q()
}
  1. 参数
form<-"SYMBOL"
if(!is.null(args$form)){
  form<-args$form
}

tax<-"hsa"
if(!is.null(args$tax)){
  tax<-args$tax
}

outdir<-"."
if(!is.null(args$outdir)){
  outdir<-args$outdir
  if(!dir.exists(outdir)){
    dir.create(outdir,recursive=T)
  }
}

datadir <- args$input
  1. 脚本主体
## 根据物种选择数据库
taxdb<-NULL
if(tax=='hsa'){
  library(org.Hs.eg.db)
  taxdb<-"org.Hs.eg.db"
}else if(tax=='mmu'){
  library(org.Mm.eg.db)
  taxdb<-"org.Mm.eg.db"
}

## 读取差异基因,用于GO,KEGG 分析
## diff_signifcant.xls
data<-read.table(datadir,header=T,sep="\t",check.names=F) 
## deg list
gene<-as.character(unique(data[,1]))

## convert gene symbol to ENTREZID
## convert gene symbol 等 to ENTREZID
gene.id<-bitr(gene,fromType=form,toType=c("ENTREZID"),OrgDb=taxdb,drop = T)

## GO 分析 输入最好是用ENTREZ ID,值比较固定,不建议使用GeneSymbol,容易匹配出问题。
gene<-gene.id[,"ENTREZID"]

## get all data info
all_data<-merge(gene.id, data, by.x = form, by.y = colnames(data)[1],all.x=T) ##by.y="RNA" or "gene_name"

###GO enrichment
if(file.exists(file.path(outdir,"ego.rdata"))){
  load(file.path(outdir,"ego.rdata"))
  }else{
    ego<-enrichGO(gene=gene,
                  OrgDb=taxdb,
                  minGSSize = 2,
                  keyType = "ENTREZID",
                  ont ="ALL",
                  pvalueCutoff = 0.22, 
                  readable=TRUE)

    save(ego,file=file.path(outdir,"ego.rdata"))
    }
write.table(ego@result,file.path(outdir,"GO_enrichment.xls"),row.names=F,quote=F,sep="\t")


####KEGG enrichment
if (file.exists(file.path(outdir,"kegg.rdata"))){
  load(file.path(outdir,"kegg.rdata"))
  }else{
    ## 输入Gene ID的格式和类型与enrichGO一致
    kegg <- enrichKEGG(gene=gene,
                  organism=tax,
                  keyType="ncbi-geneid",
                  pvalueCutoff = 0.05,
                  pAdjustMethod = "BH",  
                  minGSSize = 2, 
                  maxGSSize = 500,
                  qvalueCutoff = 0.1, 
                  use_internal_data = FALSE)
    save(kegg,file=file.path(outdir,"kegg.rdata"))
    }
    
kegg<-setReadable(kegg,
                  OrgDb=taxdb,
                  keyType="ENTREZID") ###mapping gene id to gene symbol
kegg_pathview<-kegg@result
colnames(kegg_pathview)[1]<-"pathwayID"
write.table(kegg_pathview,file.path(outdir,"KEGG_enrichment.xls"),quote=F,row.names=F,sep="\t")

GO 画图

##### ------------------- plot GO -----------------------
## 按照 ONTOLOGY 对 ego@result 进行分组,并取top10
top10_list <- lapply(split(ego@result, ego@result$ONTOLOGY), function(x) head(x,10))
## do.call(func, list)  对list中的每一个元素都执行func,适用于大规模数据。
top10 <- do.call("rbind",top10_list)
top10 <- top10[top10$p.adjust<=0.05,]

top10$Description <- factor(top10$Description,levels=unique(top10$Description))
char_size <- max(nchar(as.character(top10$Description)))

if(char_size<=55){
  width = 14
  }else if(char_size<=80){
    width = 18
    }else{
      width = 20
      }

pdf(file.path(outdir,"GO_barplot.pdf"),width=width,height=10)
ggplot(top10,aes(x=Count,y=Description,fill=ONTOLOGY))+
      geom_bar(stat="identity")+
      theme_bw()+
      theme(panel.grid=element_blank(),
            axis.text.y=element_text(color="black",size=rel(1.5)),
            axis.text.x=element_text(color="black",size=rel(1.5)),
            axis.title.x=element_text(color="black",size=rel(1.5)),
            legend.text=element_text(size=rel(1.4)),
            legend.title=element_text(size=rel(1.3)))+
      scale_fill_manual(values=c("#ca4175","#6888f4","#7bc532"))
dev.off()

#------ dotplot --------
ratio <- lapply(str_split(top10$GeneRatio,"/"),as.numeric)
top10$ratio<-sapply(ratio,function(x) x[[1]]/x[[2]] )

pdf(file.path(outdir,"GO_dotplot.pdf"),width=width,height=10)
ggplot(top10,
      aes(x=ratio,y=Description,size=Count,colour=p.adjust))+
      geom_point()+
      facet_grid(ONTOLOGY~.,scales="free", space = "free")+
      theme_bw()+
      theme(panel.grid=element_blank(),
            axis.text.y=element_text(size=rel(1.5)),
            axis.text.x=element_text(size=rel(1.4)),
            axis.title.x=element_text(size=rel(1.5)),
            strip.text.y = element_text(size=rel(1.6), 
                                        angle=-90, 
                                        face="bold"),
            plot.margin=unit(c(1,0.8,1,0.8),"cm"))+
            xlab("GeneRatio")
dev.off()

KEGG作图

# -------------- draw KEGG barplot -----------------------------------------------
kegg_pathview<-kegg_pathview[kegg_pathview$p.adjust<=0.05,]
if(dim(kegg_pathview)[1]>20){
    kegg_pathview<-head(kegg_pathview,20)
}

kegg_pathview$Description<-factor(kegg_pathview$Description,levels=unique(kegg_pathview$Description))

pdf(file.path(outdir,"KEGG_barplot.pdf"),width=12,height=8)
ggplot(kegg_pathview,aes(x=Count,y=Description,fill=p.adjust))+
        geom_bar(stat="identity")+
        theme_bw()+
        theme(panel.grid=element_blank(),
              axis.text=element_text(color="black",size=rel(1.4)),
              axis.title=element_text(color="black",size=rel(1.4)))
dev.off()

## draw dotplot
ratio <- lapply(str_split(kegg_pathview$GeneRatio,"/"),as.numeric)
kegg_pathview$ratio<-sapply(ratio,function(x) x[[1]]/x[[2]] )

pdf(file.path(outdir,"KEGG_dotplot.pdf"),width=12,height=8)
ggplot(kegg_pathview,aes(x=ratio,y=Description,size=Count,color=p.adjust))+
        geom_point()+
        theme_bw()+
        theme(panel.grid=element_blank(),
              axis.text.y=element_text(color="black",size=rel(1.5)),
              axis.text.x=element_text(size=rel(1.4)),
              axis.title.x=element_text(size=rel(1.5)))
        +xlab("GeneRatio")
dev.off()


#### map gene to KEGG pathway
dir.create(file.path(outdir,"KEGG"))
for (i in which(kegg_pathview$pvalue<=0.05)){
  # get gene ID
  genelist <- unlist(strsplit(kegg_pathview$geneID[i],"/",fixed=T)) 
  gene_len <- length(genelist)
  subdata <- subset(all_data[,c("ENTREZID","log2FoldChange")],
                    subset=ENTREZID %in% genelist)
  rownames(subdata)<-subdata[,1]
  subdata[,1]<-NULL
  fname <- file.path(outdir,paste(kegg_pathview[i,1],gene_len,"png",sep="."))

  if(!file.exists(fname)){
    pw_out <- pathview(gene.data = subdata, 
                      pathway.id = kegg_pathview[i,1], 
                      species=tax, 
                      limit=list(gene =ifelse(max(abs(subdata[,1]))>3,
                      3,
                      ceiling(max(abs(subdata[,1])))),cpd=1),
                      out.suffix=gene_len,kegg.native = T,kegg.dir=file.path(outdir,"KEGG"))
    }
  }

脚本函数/方法

  • bitr(geneID, fromType, toType,OrgDb,drop=F)
    geneID: 输入的geneID
    fromType:geneID的输入格式
    toType: geneID转换后的格式
    drop:是否去除空值

clusterProfiler 包 的bitr() 可以支持多种geneID之间的转换
在生信中,我们常用的geneID主要是gene symbol 和 ensembleID,


image.png

相关文章

网友评论

    本文标题:Script-代码分析--GO,KEGG

    本文链接:https://www.haomeiwen.com/subject/ejideltx.html