美文网首页注释和富集
转录组分析(11) - GO/KEGG富集分析(3)

转录组分析(11) - GO/KEGG富集分析(3)

作者: 半夜一更 | 来源:发表于2021-08-01 14:31 被阅读0次

    对于没有Org.Db的非模式生物,由于某些原因不制作Org.Db,然后利用clusterProfiler做富集

    数据格式
    ## Annotation_GO.xlsx
    Gene.ID  Biological.Pathway  BP.Description  Molecular.Function  MF.Description Cellular.Component      CC.Description
    gene_1  GO:0007283 spermatogenesis  GO:0003677 DNA binding  GO:0000786//GO:0005634  nucleosome//nucleus
    
    ## Annotation_KO.xlsx
    Gene.ID  1st-level(pathway)  2nd-level(pathway)  3rd-level(pathway) Pathway.id  KO.id  KO.Description  EC
    gene_1  Cellular Processes  Cell growth and death  Cell cycle  ko04110  K02180  cell cycle arrest protein BUB3  -
    
    ## test_list.xlsx
    gene_id  FC
    gene_1  1.85
    
    数据准备
    #coding:utf-8
    
    library(openxlsx)
    library(tidyr)
    library(stringr)
    library(dplyr)
    
    getwd()
    dir()
    rm(list=ls())
    
    #定义GO数据结构化函数
    data_clean <-function(x,ont){
      colnames(x) <- c("gene","GO_id","GO_name")
      gterms <- x %>%
        dplyr::select(gene, GO_id,GO_name) %>% na.omit()
      
      gene2go <- data.frame(gene = character(),
                               GO_id = character(),
                               GO_name = character(),
                               Ont = character()
      )
      for (row in 1:nrow(gterms)){
        gene_terms_id <- str_split(gterms[row,"GO_id"], "//", simplify = FALSE)[[1]] 
        gene_terms_name <- str_split(gterms[row,"GO_name"], "//", simplify = FALSE)[[1]]
        gene_id <- gterms[row, "gene"][[1]]
        
        tmp <- data.frame(gene = rep(gene_id, length(gene_terms_id)),
                          GO_id = gene_terms_id,
                          GO_name = gene_terms_name,
                          Ont = rep(ont, length(gene_terms_id))
        )
        gene2go <- rbind(gene2go, tmp)
      }
      return(gene2go)
    }
    # 载入数据
    go_file <- read.xlsx("Annotation_GO.xlsx")
    go_file[go_file=="--"]<-NA
    go_bp <- go_file[,1:3]
    
    # 按Ont类型格式化数据
    ## BP
    go_bp <- go_file[,1:3]
    gene2go_bp <- data_clean(go_bp,"BP")
    go2gene_bp <- distinct(gene2go_bp[,c(2,1)])
    go2name_bp <- distinct(gene2go_bp[,c(2,3)])
    ## MF
    go_mf <- go_file[,c(1,4,5)]
    gene2go_mf <- data_clean(go_mf,"MF")
    go2gene_mf <- distinct(gene2go_mf[,c(2,1)])
    go2name_mf <- distinct(gene2go_mf[,c(2,3)])
    ## CC
    go_cc <- go_file[,c(1,6,7)]
    gene2go_cc <- data_clean(go_cc,"CC")
    go2gene_cc <- distinct(gene2go_cc[,c(2,1)])
    go2name_cc <- distinct(gene2go_cc[,c(2,3)])
    
    #保存关键数据
    save(go2gene_bp,go2name_bp,go2gene_mf,go2name_mf,go2gene_cc,go2name_cc,file="XX.go.RData")
    
    #富集时调用
    #load(file = "XX.go.RData")
    
    #Pathway数据格式化
    rm(list=ls())
    ko_file <- read.xlsx("Annotation_KO.xlsx")
    ko_file[ko_file=="--"]<-NA
    gene2ko <- ko_file[,c(1,5,4,3,2)]
    colnames(gene2ko) <- c("gene","pathway_id","pathway_name_lv3","pathway_name_lv2","pathway_name_lv1")
    ko2gene <- distinct(gene2ko[,c(2,1)])
    ko2name <- distinct(gene2ko[,c(2,3)])
    #保存关键数据
    save(gene2ko,ko2gene,ko2name,file="XX.ko.RData")
    #富集时调用
    #load(file = "XX.ko.RData")
    
    
    富集分析绘图
    if (!requireNamespace("BiocManager", quietly = TRUE))
      install.packages("BiocManager")
    BiocManager::install(c("clusterProfiler"))
    
    library(clusterProfiler)
    library(openxlsx)
    
    ## __init__
    rm(list = ls())
    load(file = "XX.go.RData")
    load(file = "AX.ko.RData")
    ## 导入数据
    test_list <- read.xlsx("test_list.xlsx")
    ## 准备数据格式
    test_gene <- test_list[,1]
    fc_list <- test_list[,2]
    names(fc_list)<- as.character(test_list[,1])
    fc_list <- sort(fc_list,decreasing = T)
    ## GO/KO富集
    enrich_go <- enricher(
      test_gene,              # 前景基因集
      pvalueCutoff = 0.05,    # p值阈值
      pAdjustMethod = "fdr",   # 调整p值校正方式。(one of "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none")
      qvalueCutoff = 0.05,       # q值阈值
      TERM2GENE = go2gene_bp, # 修改以确定富集的Ont类型(go2gene_bp, go2gene_mf, go2gene_cc,ko2gene)
      TERM2NAME = go2name_bp  # 跟随上一行一起修改 (go2name_bp, go2name_mf, go2name_cc, ko2name)
      #universe,              # 背景基因集
      #minGSSize = 10,
      #maxGSSize = 500,
    )
    ## 保存富集结果
    write.csv(enrich_go@result,file="XXX.go.csv")
    ## 绘制条形图
    plot_data <- enrich_go@result[1:10,c(2,9,5)]
    library(ggplot2)
    pp1=ggplot(plot_data, aes(x=reorder(plot_data$Description,plot_data$Count ),y=Count,fill=-1*log10(plot_data$pvalue)))
    pbar=pp1+geom_bar(stat="identity")+theme_minimal()+coord_flip()
    output_bar_pdf <- paste("XXX",".bar",".pdf",sep = '')
    ggsave(pbar,file=output_bar_pdf,width=10,height=6)
    ## 绘制气泡图
    library(ggplot2)
    pp= ggplot(plot_data, aes(x=plot_data$pvalue,y=reorder(plot_data$Description,plot_data$pvalue))) 
    pbubble = pp + geom_point(aes(size=plot_data$Count,color=-1*log10(plot_data$pvalue))) + scale_colour_gradient(low="green",high = "red")+ theme_minimal()
    output_bb_pdf <- paste("XXX",".bb",".pdf",sep = '')
    ggsave(pbubble,file=output_bb_pdf,width=10,height=6)
    
    ## GSEA富集分析
    gse_result <- GSEA(
      fc_list,
      exponent = 1,
      eps = 1e-10,
      pvalueCutoff = 1,
      pAdjustMethod = "fdr",
      TERM2GENE = go2gene_bp,    # 修改以确定富集的Ont类型(go2gene_bp, go2gene_mf, go2gene_cc,ko2gene)
      TERM2NAME = go2name_bp,    # 跟随上一行一起修改 (go2name_bp, go2name_mf, go2name_cc, ko2name)
      verbose = TRUE,
      by = "fgsea"
      #minGSSize = 10,
      #maxGSSize = 500,
    )
    ## 保存结果
    write.csv(gse_result@result,file = "XXX.csv")
    ## 批量绘制GSEA图
    for(i in 1:length(gse_result@result$ID)) {
      id <- gse_result@result$ID[i]
      gsefig<- gseaplot(gse_result,geneSetID = id)
      output_gse_pdf <- paste("XXX",i,".",id,".gse",".pdf",sep = '')
      ggsave(gsefig,file=output_gse_pdf,width=10,height=6)
    }
    
    

    相关文章

      网友评论

        本文标题:转录组分析(11) - GO/KEGG富集分析(3)

        本文链接:https://www.haomeiwen.com/subject/pmalvltx.html