美文网首页注释和富集
Script-代码分析--GO,KEGG

Script-代码分析--GO,KEGG

作者: QXPLUS | 来源:发表于2021-06-08 21:46 被阅读0次
    1. 参数设置
    library(getopt)
    command= matrix(c(
      'help'          ,"h", 0, "logical","不接参数:帮助文档;",
      'input'         ,"i", 1, "character","参数必选:包含10X的三个文件稀疏矩阵或者h5,多个样本逗号分隔",
      'tax'           ,"t", 1, "character","参数必选:hsa or mmu,default:hsa",
      'from'          ,'f', 2, "character","参数可选:输入的基因格式是SYMBOL or ENSEMBL.  default:SYMBOL",
      'outdir'        ,"o", 2, "character","参数可选:output directory,default:'.'"
      ), byrow=TRUE, ncol=5)
    args=getopt(command)
    
    if(!is.null(args$help) | is.null(args$input)){
        cat(paste(getopt(command, usage = T), "\n"))
        q()
    }
    
    1. 参数
    form<-"SYMBOL"
    if(!is.null(args$form)){
      form<-args$form
    }
    
    tax<-"hsa"
    if(!is.null(args$tax)){
      tax<-args$tax
    }
    
    outdir<-"."
    if(!is.null(args$outdir)){
      outdir<-args$outdir
      if(!dir.exists(outdir)){
        dir.create(outdir,recursive=T)
      }
    }
    
    datadir <- args$input
    
    1. 脚本主体
    ## 根据物种选择数据库
    taxdb<-NULL
    if(tax=='hsa'){
      library(org.Hs.eg.db)
      taxdb<-"org.Hs.eg.db"
    }else if(tax=='mmu'){
      library(org.Mm.eg.db)
      taxdb<-"org.Mm.eg.db"
    }
    
    ## 读取差异基因,用于GO,KEGG 分析
    ## diff_signifcant.xls
    data<-read.table(datadir,header=T,sep="\t",check.names=F) 
    ## deg list
    gene<-as.character(unique(data[,1]))
    
    ## convert gene symbol to ENTREZID
    ## convert gene symbol 等 to ENTREZID
    gene.id<-bitr(gene,fromType=form,toType=c("ENTREZID"),OrgDb=taxdb,drop = T)
    
    ## GO 分析 输入最好是用ENTREZ ID,值比较固定,不建议使用GeneSymbol,容易匹配出问题。
    gene<-gene.id[,"ENTREZID"]
    
    ## get all data info
    all_data<-merge(gene.id, data, by.x = form, by.y = colnames(data)[1],all.x=T) ##by.y="RNA" or "gene_name"
    
    ###GO enrichment
    if(file.exists(file.path(outdir,"ego.rdata"))){
      load(file.path(outdir,"ego.rdata"))
      }else{
        ego<-enrichGO(gene=gene,
                      OrgDb=taxdb,
                      minGSSize = 2,
                      keyType = "ENTREZID",
                      ont ="ALL",
                      pvalueCutoff = 0.22, 
                      readable=TRUE)
    
        save(ego,file=file.path(outdir,"ego.rdata"))
        }
    write.table(ego@result,file.path(outdir,"GO_enrichment.xls"),row.names=F,quote=F,sep="\t")
    
    
    ####KEGG enrichment
    if (file.exists(file.path(outdir,"kegg.rdata"))){
      load(file.path(outdir,"kegg.rdata"))
      }else{
        ## 输入Gene ID的格式和类型与enrichGO一致
        kegg <- enrichKEGG(gene=gene,
                      organism=tax,
                      keyType="ncbi-geneid",
                      pvalueCutoff = 0.05,
                      pAdjustMethod = "BH",  
                      minGSSize = 2, 
                      maxGSSize = 500,
                      qvalueCutoff = 0.1, 
                      use_internal_data = FALSE)
        save(kegg,file=file.path(outdir,"kegg.rdata"))
        }
        
    kegg<-setReadable(kegg,
                      OrgDb=taxdb,
                      keyType="ENTREZID") ###mapping gene id to gene symbol
    kegg_pathview<-kegg@result
    colnames(kegg_pathview)[1]<-"pathwayID"
    write.table(kegg_pathview,file.path(outdir,"KEGG_enrichment.xls"),quote=F,row.names=F,sep="\t")
    
    

    GO 画图

    ##### ------------------- plot GO -----------------------
    ## 按照 ONTOLOGY 对 ego@result 进行分组,并取top10
    top10_list <- lapply(split(ego@result, ego@result$ONTOLOGY), function(x) head(x,10))
    ## do.call(func, list)  对list中的每一个元素都执行func,适用于大规模数据。
    top10 <- do.call("rbind",top10_list)
    top10 <- top10[top10$p.adjust<=0.05,]
    
    top10$Description <- factor(top10$Description,levels=unique(top10$Description))
    char_size <- max(nchar(as.character(top10$Description)))
    
    if(char_size<=55){
      width = 14
      }else if(char_size<=80){
        width = 18
        }else{
          width = 20
          }
    
    pdf(file.path(outdir,"GO_barplot.pdf"),width=width,height=10)
    ggplot(top10,aes(x=Count,y=Description,fill=ONTOLOGY))+
          geom_bar(stat="identity")+
          theme_bw()+
          theme(panel.grid=element_blank(),
                axis.text.y=element_text(color="black",size=rel(1.5)),
                axis.text.x=element_text(color="black",size=rel(1.5)),
                axis.title.x=element_text(color="black",size=rel(1.5)),
                legend.text=element_text(size=rel(1.4)),
                legend.title=element_text(size=rel(1.3)))+
          scale_fill_manual(values=c("#ca4175","#6888f4","#7bc532"))
    dev.off()
    
    #------ dotplot --------
    ratio <- lapply(str_split(top10$GeneRatio,"/"),as.numeric)
    top10$ratio<-sapply(ratio,function(x) x[[1]]/x[[2]] )
    
    pdf(file.path(outdir,"GO_dotplot.pdf"),width=width,height=10)
    ggplot(top10,
          aes(x=ratio,y=Description,size=Count,colour=p.adjust))+
          geom_point()+
          facet_grid(ONTOLOGY~.,scales="free", space = "free")+
          theme_bw()+
          theme(panel.grid=element_blank(),
                axis.text.y=element_text(size=rel(1.5)),
                axis.text.x=element_text(size=rel(1.4)),
                axis.title.x=element_text(size=rel(1.5)),
                strip.text.y = element_text(size=rel(1.6), 
                                            angle=-90, 
                                            face="bold"),
                plot.margin=unit(c(1,0.8,1,0.8),"cm"))+
                xlab("GeneRatio")
    dev.off()
    
    

    KEGG作图

    # -------------- draw KEGG barplot -----------------------------------------------
    kegg_pathview<-kegg_pathview[kegg_pathview$p.adjust<=0.05,]
    if(dim(kegg_pathview)[1]>20){
        kegg_pathview<-head(kegg_pathview,20)
    }
    
    kegg_pathview$Description<-factor(kegg_pathview$Description,levels=unique(kegg_pathview$Description))
    
    pdf(file.path(outdir,"KEGG_barplot.pdf"),width=12,height=8)
    ggplot(kegg_pathview,aes(x=Count,y=Description,fill=p.adjust))+
            geom_bar(stat="identity")+
            theme_bw()+
            theme(panel.grid=element_blank(),
                  axis.text=element_text(color="black",size=rel(1.4)),
                  axis.title=element_text(color="black",size=rel(1.4)))
    dev.off()
    
    ## draw dotplot
    ratio <- lapply(str_split(kegg_pathview$GeneRatio,"/"),as.numeric)
    kegg_pathview$ratio<-sapply(ratio,function(x) x[[1]]/x[[2]] )
    
    pdf(file.path(outdir,"KEGG_dotplot.pdf"),width=12,height=8)
    ggplot(kegg_pathview,aes(x=ratio,y=Description,size=Count,color=p.adjust))+
            geom_point()+
            theme_bw()+
            theme(panel.grid=element_blank(),
                  axis.text.y=element_text(color="black",size=rel(1.5)),
                  axis.text.x=element_text(size=rel(1.4)),
                  axis.title.x=element_text(size=rel(1.5)))
            +xlab("GeneRatio")
    dev.off()
    
    
    #### map gene to KEGG pathway
    dir.create(file.path(outdir,"KEGG"))
    for (i in which(kegg_pathview$pvalue<=0.05)){
      # get gene ID
      genelist <- unlist(strsplit(kegg_pathview$geneID[i],"/",fixed=T)) 
      gene_len <- length(genelist)
      subdata <- subset(all_data[,c("ENTREZID","log2FoldChange")],
                        subset=ENTREZID %in% genelist)
      rownames(subdata)<-subdata[,1]
      subdata[,1]<-NULL
      fname <- file.path(outdir,paste(kegg_pathview[i,1],gene_len,"png",sep="."))
    
      if(!file.exists(fname)){
        pw_out <- pathview(gene.data = subdata, 
                          pathway.id = kegg_pathview[i,1], 
                          species=tax, 
                          limit=list(gene =ifelse(max(abs(subdata[,1]))>3,
                          3,
                          ceiling(max(abs(subdata[,1])))),cpd=1),
                          out.suffix=gene_len,kegg.native = T,kegg.dir=file.path(outdir,"KEGG"))
        }
      }
    

    脚本函数/方法

    • bitr(geneID, fromType, toType,OrgDb,drop=F)
      geneID: 输入的geneID
      fromType:geneID的输入格式
      toType: geneID转换后的格式
      drop:是否去除空值

    clusterProfiler 包 的bitr() 可以支持多种geneID之间的转换
    在生信中,我们常用的geneID主要是gene symbol 和 ensembleID,


    image.png

    相关文章

      网友评论

        本文标题:Script-代码分析--GO,KEGG

        本文链接:https://www.haomeiwen.com/subject/ejideltx.html