美文网首页基因注释/富集分析与功能分类GEO数据挖掘
R语言GEO数据挖掘:步骤四:富集分析KEGG,GO

R语言GEO数据挖掘:步骤四:富集分析KEGG,GO

作者: mayoneday | 来源:发表于2019-03-27 21:58 被阅读211次

    1.读取第三部存储数据(基因差异表达情况)

    rm(list = ls())  ## 魔幻操作,一键清空~
    load(file = 'deg.Rdata')
    head(deg)
    
    deg

    2.设定阈值计算基因上调下调数量

    ## 不同的阈值,筛选到的差异基因数量就不一样,后面的超几何分布检验结果就大相径庭。
    logFC_t=1.5
    deg$g=ifelse(deg$P.Value>0.05,'stable',
                ifelse( deg$logFC > logFC_t,'UP',
                        ifelse( deg$logFC < -logFC_t,'DOWN','stable') )
    )#P>0.05输出stable,其中设定当logFC大于1.5为上调输出'UP',大于-1.5为下调输出'DOWN',,如果都不是则输出'stable',从而增加了一列g,筛选出了上调和下调的基因
    table(deg$g)
    
    得出上调41,下调88
    head(deg)
    
    deg多了提示基因上下调的一列g
    deg$symbol=rownames(deg)
    
    给deg增加一列基因名

    3.ID转换

    library(ggplot2)
    library(clusterProfiler)
    library(org.Hs.eg.db)
    df <- bitr(unique(deg$symbol), fromType = "SYMBOL",
               toType = c( "ENTREZID"),
               OrgDb = org.Hs.eg.db)
    #bitr功能为ID转换,
    #bitr(geneID, fromType, toType, OrgDb, drop = TRUE);
    #geneid :基因ID输入 ; fromtype : 输入ID型;toType:输出ID型;orgdb :注释数据库)
    head(df)
    
    QQ截图20190327194950.jpg
    DEG=deg#把deg数据赋值给DEG数据
    head(DEG)
    
    得到DEG
    DEG=merge(DEG,df,by.y='SYMBOL',by.x='symbol')
    #把数据DEG,df通过,DEG的'symbol'列,df的'SYMBOL'列连接在一起,转化ID
    head(DEG)
    save(DEG,file = 'anno_DEG.Rdata')
    
    DEG

    4.得出差异基因

    gene_up= DEG[DEG$g == 'UP','ENTREZID'] #选出上调基因ID
    gene_down=DEG[DEG$g == 'DOWN','ENTREZID'] #选出下调基因ID
    gene_diff=c(gene_up,gene_down)#得出上下调基因ID
    gene_all=as.character(DEG[ ,'ENTREZID'] )#得出所有基因ID
    data(geneList, package="DOSE")#得出geneList数据
    head(geneList)#查看数据
    
    geneList
    boxplot(geneList)#画箱线图
    boxplot(DEG$logFC)#画箱线图
    geneList=DEG$logFC#把DEG数据logFC列值赋值给数据geneList
    
    QQ截图20190327214317.jpg
    names(geneList)=DEG$ENTREZID#把ID赋值给geneList数据的名字
    
    得到geneList:ID和表达量的关系
    geneList=sort(geneList,decreasing = T)#把数据进行排序
    
    排序之后的geneList

    5. KEGG pathway analysis

    做KEGG数据集超几何分布检验分析,重点在结果的可视化及生物学意义的理解。

    if(T){
      ###   over-representation test
      kk.up <- enrichKEGG(gene         = gene_up,
                          organism     = 'hsa',
                          universe     = gene_all,
                          pvalueCutoff = 0.9,
                          qvalueCutoff =0.9)
      head(kk.up)[,1:6]
      dotplot(kk.up );ggsave('kk.up.dotplot.png')
      kk.down <- enrichKEGG(gene         =  gene_down,
                            organism     = 'hsa',
                            universe     = gene_all,
                            pvalueCutoff = 0.9,
                            qvalueCutoff =0.9)
      head(kk.down)[,1:6]
      dotplot(kk.down );ggsave('kk.down.dotplot.png')
      kk.diff <- enrichKEGG(gene         = gene_diff,
                            organism     = 'hsa',
                            pvalueCutoff = 0.05)
      head(kk.diff)[,1:6]
      dotplot(kk.diff );ggsave('kk.diff.dotplot.png')
      
      kegg_diff_dt <- as.data.frame(kk.diff)
      kegg_down_dt <- as.data.frame(kk.down)
      kegg_up_dt <- as.data.frame(kk.up)
      down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.05,];down_kegg$group=-1
      up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.05,];up_kegg$group=1
      source('functions.R')
      g_kegg=kegg_plot(up_kegg,down_kegg)
      print(g_kegg)
      
      ggsave(g_kegg,filename = 'kegg_up_down.png')
    

    6. GSEA

      kk_gse <- gseKEGG(geneList     = geneList,
                        organism     = 'hsa',
                        nPerm        = 1000,
                        minGSSize    = 120,
                        pvalueCutoff = 0.9,
                        verbose      = FALSE)
      head(kk_gse)[,1:6]
      gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))
      
      down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,];down_kegg$group=-1
      up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,];up_kegg$group=1
      
      g_kegg=kegg_plot(up_kegg,down_kegg)
      print(g_kegg)
      ggsave(g_kegg,filename = 'kegg_up_down_gsea.png')
      
      
    }
    

    7.GO database analysis

    做GO数据集超几何分布检验分析,重点在结果的可视化及生物学意义的理解。

    {
      
      g_list=list(gene_up=gene_up,
                  gene_down=gene_down,
                  gene_diff=gene_diff)
      
      if(F){
        go_enrich_results <- lapply( g_list , function(gene) {
          lapply( c('BP','MF','CC') , function(ont) {
            cat(paste('Now process ',ont ))
            ego <- enrichGO(gene          = gene,
                            universe      = gene_all,
                            OrgDb         = org.Hs.eg.db,
                            ont           = ont ,
                            pAdjustMethod = "BH",
                            pvalueCutoff  = 0.99,
                            qvalueCutoff  = 0.99,
                            readable      = TRUE)
            
            print( head(ego) )
            return(ego)
          })
        })
        save(go_enrich_results,file = 'go_enrich_results.Rdata')
        
      }
      
      
      load(file = 'go_enrich_results.Rdata')
      
      n1= c('gene_up','gene_down','gene_diff')
      n2= c('BP','MF','CC') 
      for (i in 1:3){
        for (j in 1:3){
          fn=paste0('dotplot_',n1[i],'_',n2[j],'.png')
          cat(paste0(fn,'\n'))
          png(fn,res=150,width = 1080)
          print( dotplot(go_enrich_results[[i]][[j]] ))
          dev.off()
        }
      }
      
      
    }
    

    把之前的数据设置好之后,后面的富集分析也是傻瓜式的

    最后

    感谢jimmy的生信技能树团队!

    感谢导师岑洪老师!

    感谢健明、孙小洁,慧美等生信技能树团队的老师一路以来的指导和鼓励!

    特别注明:此文中编码来自生信技能树健明老师

    相关文章

      网友评论

        本文标题:R语言GEO数据挖掘:步骤四:富集分析KEGG,GO

        本文链接:https://www.haomeiwen.com/subject/fcyovqtx.html