美文网首页
KEGG分析

KEGG分析

作者: 余绕 | 来源:发表于2022-02-02 16:03 被阅读0次
    library('R2HTML')
    Rice KEGG 分析
    #读取背景基因(所有注释到KEGG的基因)和差异基因列表,以K号表示
    
    allgene=read.delim('Rice_all_gene_KOs.txt',header=F,sep="\t",stringsAsFactors = F)#读取水稻所有pathways
    
    head(allgene) #展示allgene数据
    ##       V1
    ## 1 K02989
    ## 2 K02989
    ## 3 K01634
    ## 4 K09880
    ## 5 K13151
    ## 6 K13151
    deggene=read.delim('Degs_KOs.txt',header=F,sep="\t",stringsAsFactors = F) #取差异基因的pathway信息
    
    head(deggene) #展示deggene数据
    ##       V1
    ## 1 K02989
    ## 2 K01595
    ## 3 K15414
    ## 4 K11252
    ## 5 K07874
    ## 6 K09489
    pathinfo=read.delim('kegg.pathway.id',header=T,sep="\t",stringsAsFactors = F) #取所有的pathway信息
    
    head(pathinfo) #展示pathinfo数据
    ##     KOID                       PATHWAY     PID Category
    ## 1 K00844 glycolysis / Gluconeogenesis  ko00010     PATH
    ## 2 K12407 glycolysis / Gluconeogenesis  ko00010     PATH
    ## 3 K00845 glycolysis / Gluconeogenesis  ko00010     PATH
    ## 4 K01810 glycolysis / Gluconeogenesis  ko00010     PATH
    ## 5 K06859 glycolysis / Gluconeogenesis  ko00010     PATH
    ## 6 K13810 glycolysis / Gluconeogenesis  ko00010     PATH
    #读取背景基因(所有注释到KEGG的基因)和差异基因列表,以K号表示
    
    pathway=unique(pathinfo[,c('PID','PATHWAY')]) #unique()去除重复函数,获得单一的Pathway用于后面for循环计数每个pathway的Ko数目
    
    head(pathway) #展示pathway数据
    ##         PID                                   PATHWAY
    ## 1   ko00010             glycolysis / Gluconeogenesis 
    ## 105 ko00020                citrate cycle (TCA cycle) 
    ## 164 ko00030                pentose phosphate pathway 
    ## 247 ko00040 pentose and glucuronate interconversions 
    ## 326 ko00051          fructose and mannose metabolism 
    ## 435 ko00052                     galactose metabolism
    allgene=unique(allgene$V1)   #unique()去除重复函数,获得单一的Pathway用于后面for循环计数每个pathway的Ko数目
    
    head(allgene) #展示allgene数据
    ## [1] "K02989" "K01634" "K09880" "K13151" "K01114" "K08066"
    deggene=unique(deggene$V1)   #unique()去除重复函数,获得单一的Pathway用于后面for循环计数每个pathway的Ko数目
    
    head(deggene)  #展示deggene数据
    ## [1] "K02989" "K01595" "K15414" "K11252" "K07874" "K09489"
    #==============设定输出的文件名称============
    
    outfile="kegg.enrichment"
    #=======将基因与Pathway中有注释的基因取交集========
    
    deginfo=pathinfo[pathinfo[,1]%in%deggene,] #相当于merge
    
    head(deginfo)#展示deginfo数据
    ##      KOID                       PATHWAY     PID Category
    ## 1  K00844 glycolysis / Gluconeogenesis  ko00010     PATH
    ## 4  K01810 glycolysis / Gluconeogenesis  ko00010     PATH
    ## 8  K00850 glycolysis / Gluconeogenesis  ko00010     PATH
    ## 12 K00895 glycolysis / Gluconeogenesis  ko00010     PATH
    ## 18 K01623 glycolysis / Gluconeogenesis  ko00010     PATH
    ## 24 K01803 glycolysis / Gluconeogenesis  ko00010     PATH
    allinfo=pathinfo[pathinfo[,1]%in%allgene,]  #相当于merge
    
    head(allinfo) #展示allinfo数据
    ##      KOID                       PATHWAY     PID Category
    ## 1  K00844 glycolysis / Gluconeogenesis  ko00010     PATH
    ## 4  K01810 glycolysis / Gluconeogenesis  ko00010     PATH
    ## 8  K00850 glycolysis / Gluconeogenesis  ko00010     PATH
    ## 12 K00895 glycolysis / Gluconeogenesis  ko00010     PATH
    ## 13 K03841 glycolysis / Gluconeogenesis  ko00010     PATH
    ## 18 K01623 glycolysis / Gluconeogenesis  ko00010     PATH
    k=length(unique(deginfo[,1]))  #差异基因总数,length函数求基因数目
    
    k  #展示K数据
    ## [1] 353
    N=length(unique(allinfo[,1]))  #总基因数   #length函数求基因数目
    
    N  #展示N数据
    ## [1] 2012
    #=========对每个通路基于超几何分布计算富集p值=======
    
    p<-data.frame() #新建data frame用于储存KO数目
    
    
    urlColor<-NULL # used to color the genes in pathway
    geneLink<-NULL # used to set hyperlink to DEGs in pathway
    
    #通过循环计算每个KEGGpathways上的Ko号
    
    for (i in seq(1,nrow(pathway))) {
     
      pid=pathway[i,'PID'] #获取Pathway的KO号码,例如:pathway[1,'PID']=》 "ko00010";pathway[1,'PID']=》"ko00020"等
      
      allInPath=allinfo[allinfo[,'PID']%in%pid,] #获得每一个pathway所有的KO数目,具体见备注
      
      m=length(unique(allInPath[,1]))  #计算i Pathway基因总数
      
      degInPath=deginfo[deginfo[,'PID']%in%pid,] #获得每一个pathway所有的KO数目
      
      x=length(unique(degInPath[,1]))  #计算i Pathway中差异基因数
      
      p[i,1]=x #数据框p的第i行,第一列值
      p[i,2]=m #数据框p的第i行,第二列值
      p[i,3]=phyper(x-1,m,N-m,k,lower.tail=FALSE)  #lower.tail=FALSE,计算的是P[X>(x-1)]的概率,其中x-1:某一pathway中差异基因数目(或KO数),m:背景数(或KO数),N:所有基因数(或KO数),K:所有差异数(或KO数)
      
      urlColor[i]=apply(as.matrix(paste("/",t(degInPath[,1]),'%09red',sep="")),2,paste,collapse="")
      Link=paste('<a href=', shQuote(paste('https://www.kegg.jp/dbget-bin/www_bget?ko:',degInPath[,1],sep="")),'/','>', degInPath[,1],'</a>',sep="")
      if(x==0){
        geneLink[i]="--"
      }else{
        geneLink[i]=apply(as.matrix(Link),2,paste,collapse=", ")
      }
    }
    #============计算FDR,整理结果==============
    
    fdr=p.adjust(p[,3],method="BH") #利用第三列的p值计算FDR值,!!!可以借鉴!!!!
    
    output=cbind(pathway,p,fdr)#合并数据库框,产生新的数据框,因为for循环按照pathway内容从上到下依次运行的,所以顺序是相同的,可以进行列合并
    
    head(output)#展示数据内容
    ##         PID                                   PATHWAY V1 V2           V3
    ## 1   ko00010             glycolysis / Gluconeogenesis  21 34 7.896160e-09
    ## 105 ko00020                citrate cycle (TCA cycle)  16 20 1.469740e-09
    ## 164 ko00030                pentose phosphate pathway  10 17 1.484026e-04
    ## 247 ko00040 pentose and glucuronate interconversions   6 14 2.381463e-02
    ## 326 ko00051          fructose and mannose metabolism   7 18 2.628347e-02
    ## 435 ko00052                     galactose metabolism   7 15 8.569568e-03
    ##              fdr
    ## 1   5.152244e-07
    ## 105 1.278674e-07
    ## 164 5.533295e-03
    ## 247 2.702443e-01
    ## 326 2.840186e-01
    ## 435 1.574602e-01
    colnames(output)=c('ID','Pathway','DEGsInPathway','GenesInPathway','Pvalue','FDR') #重新给数据框命名
    
    ind=order(output[,"Pvalue"])#按照P值从小到大对结果排序
    output=output[ind,]       #用于输出到表格文件
    urlColor=urlColor[ind]
    DEGs=geneLink[ind]
    
    output2=cbind(output,DEGs) #用于输出到HTML文件
    #==============导出到表格文件================
    
    write.table(output,file=paste(outfile,".xls",sep=""),quote = F,row.names = F,col.names = T,sep="\t")
    #==============导出结果到HTML文件============
    
    #==============导出结果到HTML文件============
    urlTitles <- output[,'ID']
    url=paste('https://www.genome.jp/kegg-bin/show_pathway?',gsub("ko", "map", urlTitles),urlColor,sep="")
    
    htmlOUT <- transform(output2, ID = paste('<a href = ', shQuote(url),'/','>', urlTitles, '</a>'))
    
    target <- HTMLInitFile(Title = "KEGG Pathway Ernichment Results",outdir=getwd(),filename=outfile, BackGroundColor="#f8f8f8",useGrid = T,useLaTeX = F,HTMLframe = F)
    write("<style>table{border-collapse:collapse;width:1280px;}a:link,a:visited {color: #3366FF;text-decoration: none; }a:hover,a:active {color: #ff3366;text-decoration: none; };</style>",target,append = T)
    HTML(as.title("KEGG Pathway Ernichment Results"),file=target)
    HTML(htmlOUT,file=target,innerBorder = 1,row.names = F,digits=3)
    HTMLEndFile()
    #============================================
    #——————备注——————-
    
    #------------------备注-------------------
    pid=pathway[2,'PID']#获得第二个pathway
    
    pid #pid内容
    ## [1] "ko00020"
    allInPath=allinfo[allinfo[,'PID']%in%pid,]#计算第二个pathway中所有背景Ko号
    
    allInPath #展示allInPath内容
    ##       KOID                    PATHWAY     PID Category
    ## 105 K01647 citrate cycle (TCA cycle)  ko00020     PATH
    ## 106 K01648 citrate cycle (TCA cycle)  ko00020     PATH
    ## 110 K01681 citrate cycle (TCA cycle)  ko00020     PATH
    ## 112 K00031 citrate cycle (TCA cycle)  ko00020     PATH
    ## 113 K00030 citrate cycle (TCA cycle)  ko00020     PATH
    ## 115 K00164 citrate cycle (TCA cycle)  ko00020     PATH
    ## 116 K00658 citrate cycle (TCA cycle)  ko00020     PATH
    ## 117 K00382 citrate cycle (TCA cycle)  ko00020     PATH
    ## 124 K01899 citrate cycle (TCA cycle)  ko00020     PATH
    ## 125 K01900 citrate cycle (TCA cycle)  ko00020     PATH
    ## 127 K00234 citrate cycle (TCA cycle)  ko00020     PATH
    ## 128 K00235 citrate cycle (TCA cycle)  ko00020     PATH
    ## 129 K00236 citrate cycle (TCA cycle)  ko00020     PATH
    ## 144 K01679 citrate cycle (TCA cycle)  ko00020     PATH
    ## 145 K00025 citrate cycle (TCA cycle)  ko00020     PATH
    ## 146 K00026 citrate cycle (TCA cycle)  ko00020     PATH
    ## 152 K01610 citrate cycle (TCA cycle)  ko00020     PATH
    ## 155 K00161 citrate cycle (TCA cycle)  ko00020     PATH
    ## 156 K00162 citrate cycle (TCA cycle)  ko00020     PATH
    ## 157 K00627 citrate cycle (TCA cycle)  ko00020     PATH
    

    参考:

    Beijing PlantTech Technologies
    Email:tech@planttech.com.cn

    相关文章

      网友评论

          本文标题:KEGG分析

          本文链接:https://www.haomeiwen.com/subject/rxrrkrtx.html