import numpy
import collections
import scipy.stats
import math
import vcf
###读取SNP位点
pos_list=[]
vcf_reader=vcf.Reader(filename=r'suis.vcf')
for record in vcf_reader:
pos_list.append(int(record.POS))
branch1_snp_list=[]
branch2_snp_list=[]
total_snp_list=[]
#读取fasta文件
with open(r'suis_branch1.fasta') as f:
for line in f:
if line.startswith('>'):
continue
else:
snp=line.replace('\n','')
snp=str(snp)
branch1_snp_list.append(list(snp))
total_snp_list.append(list(snp))
with open(r'suis_branch2.fasta') as f:
for line in f:
if line.startswith('>'):
continue
else:
snp=line.replace('\n','')
snp = str(snp)
branch2_snp_list.append(list(snp))
total_snp_list.append(list(snp))
branch1_array=[]
branch2_array=[]
total_array=[]
array_len=len(branch1_snp_list[0])
print("一共有%d个位点" % array_len)
for i in range(0,array_len):
branch1_array.append(0)
branch2_array.append(0)
total_array.append(0)
branch1_array=numpy.array(branch1_array)
branch2_array=numpy.array(branch2_array)
total_array=numpy.array(total_array)
#将两个分支的序列合并成矩阵
for i in branch1_snp_list:
i=numpy.array(i)
branch1_array=numpy.column_stack((branch1_array,i))
branch1_array = numpy.delete(branch1_array, 0, axis=1)###矩阵的列数为branch1内菌株的个数,行数为总共SNP位点的个数
for i in branch2_snp_list:
i=numpy.array(i)
branch2_array=numpy.column_stack((branch2_array,i))
branch2_array = numpy.delete(branch2_array, 0, axis=1)
for i in total_snp_list:
i=numpy.array(i)
total_array=numpy.column_stack((total_array,i))
total_array = numpy.delete(total_array, 0, axis=1)
###卡方检验函数
def kafang(a,b,c,d,thre,index):
n=(a+b+c+d)*1.0
branch1=(a+b)*1.0
branch2=(c+d)*1.0
major=(a+c)*1.0
minor=(b+d)*1.0
TRC_a=branch1*(major/n)
TRC_b=branch1-TRC_a
TRC_c=branch2*(major/n)
TRC_d=branch2-TRC_c
out_res=0
X2=0
Pval=0
###判断理论数的大小,以选择不同的卡方检验
if(n>=40):
if(TRC_a>=5 and TRC_b>=5 and TRC_c>=5 and TRC_d >=5):
X2=math.pow((a-TRC_a),2)/TRC_a+math.pow((b-TRC_b),2)/TRC_b+math.pow((c-TRC_c),2)/TRC_c+math.pow((d-TRC_d),2)/TRC_d
if(thre==0.05):
if(X2>=3.84):
out_res=1
elif(thre==0.01):
if(X2>=6.64):
out_res=1
elif(TRC_a>=1 and TRC_b>=1 and TRC_c>=1 and TRC_d>=1):
X2=math.pow((abs(a-TRC_a)-0.5),2)/TRC_a+math.pow((abs(b-TRC_b)-0.5),2)/TRC_b+math.pow((abs(c-TRC_c)-0.5),2)/TRC_c+math.pow((abs(d-TRC_d)-0.5),2)/TRC_d
if (thre == 0.05):
if (X2 >= 3.84):
out_res = 1
elif (thre == 0.01):
if (X2 >= 6.64):
out_res = 1
else:
Pval=scipy.stats.fisher_exact([[a,b],[c,d]])[1]
if(thre==0.05):
if(Pval<0.05):
out_res=1
elif(thre==0.01):
if(Pval<0.01):
out_res=1
else:
Pval = scipy.stats.fisher_exact([[a, b], [c, d]])[1]
if (thre == 0.05):
if (Pval < 0.05):
out_res = 1
elif (thre == 0.01):
if (Pval < 0.01):
out_res = 1
with open(r'test.log','a+') as f:###输出所有位点的记录
print("snp_pos:",pos_list[index],file=f)
if(X2==0):
print("Pval:",Pval,file=f)
else:
print("X2:", X2,file=f)
print("num\tmajor_allele\tminor_allele",file=f)
print("bran1\t%d\t%d" % (a,b),file=f)
print("bran2\t%d\t%d" % (c,d),file=f)
finall_out=0
if(out_res==1):
if(a>b and d>c):
if(a/branch1>=0.8 and d/branch2>=0.8):
finall_out = 1
with open(r'diff_pos.log','a+') as f:
print("snp_pos:", pos_list[index], file=f)
if (X2 == 0):
print("Pval:", Pval, file=f)
else:
print("X2:", X2, file=f)
print("num\tmajor_allele\tminor_allele", file=f)
print("bran1\t%d\t%d" % (a, b), file=f)
print("bran2\t%d\t%d" % (c, d), file=f)
if(b>a and c>d):
if(b/branch1>=0.8 and c/branch2>=0.8):
finall_out = 1
with open(r'diff_pos.log','a+') as f:
print("snp_pos:", pos_list[index], file=f)
if (X2 == 0):
print("Pval:", Pval, file=f)
else:
print("X2:", X2, file=f)
print("num\tmajor_allele\tminor_allele", file=f)
print("bran1\t%d\t%d" % (a, b), file=f)
print("bran2\t%d\t%d" % (c, d), file=f)
return finall_out
###选出两个分支矩阵中的两种碱基及其对应的个数
for i in range(0,len(total_array)):
total_most_common=collections.Counter(total_array[i]).most_common()
if len(total_most_common)==2:
major_base=total_most_common[0][0]
minor_base=total_most_common[1][0]
branch1_most_common=collections.Counter(branch1_array[i]).most_common()
branch2_most_common = collections.Counter(branch2_array[i]).most_common()
###对branch1进行处理
if len(branch1_most_common)==1:
if branch1_most_common[0][0]==major_base:
branch1_major_num=branch1_most_common[0][1]
branch1_minor_num=0
else:
branch1_major_num = 0
branch1_minor_num = branch1_most_common[0][1]
else:
if branch1_most_common[0][0]==major_base:
branch1_major_num = branch1_most_common[0][1]
branch1_minor_num = branch1_most_common[1][1]
else:
branch1_major_num = branch1_most_common[1][1]
branch1_minor_num = branch1_most_common[0][1]
###对branch2进行处理
if len(branch2_most_common)==1:
if branch2_most_common[0][0]==major_base:
branch2_major_num=branch2_most_common[0][1]
branch2_minor_num=0
else:
branch2_major_num = 0
branch2_minor_num = branch2_most_common[0][1]
else:
if branch2_most_common[0][0]==major_base:
branch2_major_num = branch2_most_common[0][1]
branch2_minor_num = branch2_most_common[1][1]
else:
branch2_major_num = branch2_most_common[1][1]
branch2_minor_num = branch2_most_common[0][1]
###进行卡方检验
finall_out=kafang(a=branch1_major_num,b=branch1_minor_num,c=branch2_major_num,d=branch2_minor_num,thre=0.05,index=i)
if(finall_out==1):
with open(r'sign_diff_pos', 'a+') as f:
chrome="FM252032.1"
print("%s\t%d" % (chrome,pos_list[i]),file=f)
网友评论