输入文件有两个:一个为上一题的输出文件,另一个为limma包输出结果。
import os
import re
from collections import OrderedDict
import numpy as np
import pandas as pd
os.chdir('D:/python/question7')
kegg = OrderedDict()
pop = []
with open('cleaned_KEGG.csv') as f:
for line in f:
line = line.rstrip()
lst = line.split(',')
if len(lst) > 3:
geneList = lst[-1].split(';')
kegg[lst[2]] = geneList
pop = pop + [gene for gene in geneList if gene not in pop] #计算出所有通路的基因
DEGs = []
with open('dif.csv') as f:
for line in f:
line = line.strip()
lst = line.split(',')
if lst[0].startswith("gene"): #注意看第一行第一个元素有没有东西
continue
elif lst[0] in pop: #如果差异基因在背景基因里面,则取出放到DGEs里
DEGs.append(lst[0])
popTotal = len(pop)
#6990
listTotal = len(DEGs)
#15
print ('Number of genes in background list: %d' %popTotal)
print ('Number of DE genes involved in any pathways: %d' %listTotal)
from scipy.stats import hypergeom
keggEnrich = OrderedDict()
for ke,val in kegg.items():
hits = [gene for gene in val if gene in DEGs]
hitCount = len(hits)
popHits = len(val)
if hitCount == 0:
print(ke)
else:
pVal = hypergeom.sf(hitCount-1,popTotal,popHits,listTotal)
#Scipy中的stats.hypergeom.sf(x, m+n, m, k)相当于R中的phyper(x, m, n, k, lower.tail=FALSE),其中当lower.tail的值为真时,phyper返回一个当抽取到的白球数小于等于q值的概率,当lower.tail值为假时,phyper返回一个当抽取到的白球数大于q值的概率。phyper的返回值是一个累积的概率值。
keggEnrich[ke] = [hitCount,listTotal,popHits,popTotal,pVal,';'.join(hits)]
keggOutput = pd.DataFrame.from_dict(keggEnrich,orient='columns',dtype=None)
keggOutput = pd.DataFrame.transpose(keggOutput)
keggOutput.columns = ['Count','List Total','pop Hits','pop Total','pval','Genes']
keggOutput=keggOutput.sort_values(by='pval',axis=0)
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector
from rpy2.robjects import pandas2ri
stats = importr('stats')
pVal = keggOutput['pval']
fdr = stats.p_adjust(FloatVector(pVal),method='fdr')
fdrPD = pandas2ri.ri2py(fdr)
keggOutput.insert(5,'FDR',fdrPD)
keggOutput.to_excel('kegg_pathway_enrichment.xlsx',sheet_name='kegg')
网友评论