library('R2HTML')
Rice KEGG 分析
#读取背景基因(所有注释到KEGG的基因)和差异基因列表,以K号表示
allgene=read.delim('Rice_all_gene_KOs.txt',header=F,sep="\t",stringsAsFactors = F)#读取水稻所有pathways
head(allgene) #展示allgene数据
## V1
## 1 K02989
## 2 K02989
## 3 K01634
## 4 K09880
## 5 K13151
## 6 K13151
deggene=read.delim('Degs_KOs.txt',header=F,sep="\t",stringsAsFactors = F) #取差异基因的pathway信息
head(deggene) #展示deggene数据
## V1
## 1 K02989
## 2 K01595
## 3 K15414
## 4 K11252
## 5 K07874
## 6 K09489
pathinfo=read.delim('kegg.pathway.id',header=T,sep="\t",stringsAsFactors = F) #取所有的pathway信息
head(pathinfo) #展示pathinfo数据
## KOID PATHWAY PID Category
## 1 K00844 glycolysis / Gluconeogenesis ko00010 PATH
## 2 K12407 glycolysis / Gluconeogenesis ko00010 PATH
## 3 K00845 glycolysis / Gluconeogenesis ko00010 PATH
## 4 K01810 glycolysis / Gluconeogenesis ko00010 PATH
## 5 K06859 glycolysis / Gluconeogenesis ko00010 PATH
## 6 K13810 glycolysis / Gluconeogenesis ko00010 PATH
#读取背景基因(所有注释到KEGG的基因)和差异基因列表,以K号表示
pathway=unique(pathinfo[,c('PID','PATHWAY')]) #unique()去除重复函数,获得单一的Pathway用于后面for循环计数每个pathway的Ko数目
head(pathway) #展示pathway数据
## PID PATHWAY
## 1 ko00010 glycolysis / Gluconeogenesis
## 105 ko00020 citrate cycle (TCA cycle)
## 164 ko00030 pentose phosphate pathway
## 247 ko00040 pentose and glucuronate interconversions
## 326 ko00051 fructose and mannose metabolism
## 435 ko00052 galactose metabolism
allgene=unique(allgene$V1) #unique()去除重复函数,获得单一的Pathway用于后面for循环计数每个pathway的Ko数目
head(allgene) #展示allgene数据
## [1] "K02989" "K01634" "K09880" "K13151" "K01114" "K08066"
deggene=unique(deggene$V1) #unique()去除重复函数,获得单一的Pathway用于后面for循环计数每个pathway的Ko数目
head(deggene) #展示deggene数据
## [1] "K02989" "K01595" "K15414" "K11252" "K07874" "K09489"
#==============设定输出的文件名称============
outfile="kegg.enrichment"
#=======将基因与Pathway中有注释的基因取交集========
deginfo=pathinfo[pathinfo[,1]%in%deggene,] #相当于merge
head(deginfo)#展示deginfo数据
## KOID PATHWAY PID Category
## 1 K00844 glycolysis / Gluconeogenesis ko00010 PATH
## 4 K01810 glycolysis / Gluconeogenesis ko00010 PATH
## 8 K00850 glycolysis / Gluconeogenesis ko00010 PATH
## 12 K00895 glycolysis / Gluconeogenesis ko00010 PATH
## 18 K01623 glycolysis / Gluconeogenesis ko00010 PATH
## 24 K01803 glycolysis / Gluconeogenesis ko00010 PATH
allinfo=pathinfo[pathinfo[,1]%in%allgene,] #相当于merge
head(allinfo) #展示allinfo数据
## KOID PATHWAY PID Category
## 1 K00844 glycolysis / Gluconeogenesis ko00010 PATH
## 4 K01810 glycolysis / Gluconeogenesis ko00010 PATH
## 8 K00850 glycolysis / Gluconeogenesis ko00010 PATH
## 12 K00895 glycolysis / Gluconeogenesis ko00010 PATH
## 13 K03841 glycolysis / Gluconeogenesis ko00010 PATH
## 18 K01623 glycolysis / Gluconeogenesis ko00010 PATH
k=length(unique(deginfo[,1])) #差异基因总数,length函数求基因数目
k #展示K数据
## [1] 353
N=length(unique(allinfo[,1])) #总基因数 #length函数求基因数目
N #展示N数据
## [1] 2012
#=========对每个通路基于超几何分布计算富集p值=======
p<-data.frame() #新建data frame用于储存KO数目
urlColor<-NULL # used to color the genes in pathway
geneLink<-NULL # used to set hyperlink to DEGs in pathway
#通过循环计算每个KEGGpathways上的Ko号
for (i in seq(1,nrow(pathway))) {
pid=pathway[i,'PID'] #获取Pathway的KO号码,例如:pathway[1,'PID']=》 "ko00010";pathway[1,'PID']=》"ko00020"等
allInPath=allinfo[allinfo[,'PID']%in%pid,] #获得每一个pathway所有的KO数目,具体见备注
m=length(unique(allInPath[,1])) #计算i Pathway基因总数
degInPath=deginfo[deginfo[,'PID']%in%pid,] #获得每一个pathway所有的KO数目
x=length(unique(degInPath[,1])) #计算i Pathway中差异基因数
p[i,1]=x #数据框p的第i行,第一列值
p[i,2]=m #数据框p的第i行,第二列值
p[i,3]=phyper(x-1,m,N-m,k,lower.tail=FALSE) #lower.tail=FALSE,计算的是P[X>(x-1)]的概率,其中x-1:某一pathway中差异基因数目(或KO数),m:背景数(或KO数),N:所有基因数(或KO数),K:所有差异数(或KO数)
urlColor[i]=apply(as.matrix(paste("/",t(degInPath[,1]),'%09red',sep="")),2,paste,collapse="")
Link=paste('<a href=', shQuote(paste('https://www.kegg.jp/dbget-bin/www_bget?ko:',degInPath[,1],sep="")),'/','>', degInPath[,1],'</a>',sep="")
if(x==0){
geneLink[i]="--"
}else{
geneLink[i]=apply(as.matrix(Link),2,paste,collapse=", ")
}
}
#============计算FDR,整理结果==============
fdr=p.adjust(p[,3],method="BH") #利用第三列的p值计算FDR值,!!!可以借鉴!!!!
output=cbind(pathway,p,fdr)#合并数据库框,产生新的数据框,因为for循环按照pathway内容从上到下依次运行的,所以顺序是相同的,可以进行列合并
head(output)#展示数据内容
## PID PATHWAY V1 V2 V3
## 1 ko00010 glycolysis / Gluconeogenesis 21 34 7.896160e-09
## 105 ko00020 citrate cycle (TCA cycle) 16 20 1.469740e-09
## 164 ko00030 pentose phosphate pathway 10 17 1.484026e-04
## 247 ko00040 pentose and glucuronate interconversions 6 14 2.381463e-02
## 326 ko00051 fructose and mannose metabolism 7 18 2.628347e-02
## 435 ko00052 galactose metabolism 7 15 8.569568e-03
## fdr
## 1 5.152244e-07
## 105 1.278674e-07
## 164 5.533295e-03
## 247 2.702443e-01
## 326 2.840186e-01
## 435 1.574602e-01
colnames(output)=c('ID','Pathway','DEGsInPathway','GenesInPathway','Pvalue','FDR') #重新给数据框命名
ind=order(output[,"Pvalue"])#按照P值从小到大对结果排序
output=output[ind,] #用于输出到表格文件
urlColor=urlColor[ind]
DEGs=geneLink[ind]
output2=cbind(output,DEGs) #用于输出到HTML文件
#==============导出到表格文件================
write.table(output,file=paste(outfile,".xls",sep=""),quote = F,row.names = F,col.names = T,sep="\t")
#==============导出结果到HTML文件============
#==============导出结果到HTML文件============
urlTitles <- output[,'ID']
url=paste('https://www.genome.jp/kegg-bin/show_pathway?',gsub("ko", "map", urlTitles),urlColor,sep="")
htmlOUT <- transform(output2, ID = paste('<a href = ', shQuote(url),'/','>', urlTitles, '</a>'))
target <- HTMLInitFile(Title = "KEGG Pathway Ernichment Results",outdir=getwd(),filename=outfile, BackGroundColor="#f8f8f8",useGrid = T,useLaTeX = F,HTMLframe = F)
write("<style>table{border-collapse:collapse;width:1280px;}a:link,a:visited {color: #3366FF;text-decoration: none; }a:hover,a:active {color: #ff3366;text-decoration: none; };</style>",target,append = T)
HTML(as.title("KEGG Pathway Ernichment Results"),file=target)
HTML(htmlOUT,file=target,innerBorder = 1,row.names = F,digits=3)
HTMLEndFile()
#============================================
#——————备注——————-
#------------------备注-------------------
pid=pathway[2,'PID']#获得第二个pathway
pid #pid内容
## [1] "ko00020"
allInPath=allinfo[allinfo[,'PID']%in%pid,]#计算第二个pathway中所有背景Ko号
allInPath #展示allInPath内容
## KOID PATHWAY PID Category
## 105 K01647 citrate cycle (TCA cycle) ko00020 PATH
## 106 K01648 citrate cycle (TCA cycle) ko00020 PATH
## 110 K01681 citrate cycle (TCA cycle) ko00020 PATH
## 112 K00031 citrate cycle (TCA cycle) ko00020 PATH
## 113 K00030 citrate cycle (TCA cycle) ko00020 PATH
## 115 K00164 citrate cycle (TCA cycle) ko00020 PATH
## 116 K00658 citrate cycle (TCA cycle) ko00020 PATH
## 117 K00382 citrate cycle (TCA cycle) ko00020 PATH
## 124 K01899 citrate cycle (TCA cycle) ko00020 PATH
## 125 K01900 citrate cycle (TCA cycle) ko00020 PATH
## 127 K00234 citrate cycle (TCA cycle) ko00020 PATH
## 128 K00235 citrate cycle (TCA cycle) ko00020 PATH
## 129 K00236 citrate cycle (TCA cycle) ko00020 PATH
## 144 K01679 citrate cycle (TCA cycle) ko00020 PATH
## 145 K00025 citrate cycle (TCA cycle) ko00020 PATH
## 146 K00026 citrate cycle (TCA cycle) ko00020 PATH
## 152 K01610 citrate cycle (TCA cycle) ko00020 PATH
## 155 K00161 citrate cycle (TCA cycle) ko00020 PATH
## 156 K00162 citrate cycle (TCA cycle) ko00020 PATH
## 157 K00627 citrate cycle (TCA cycle) ko00020 PATH
参考:
网友评论