# -*- coding: utf-8 -*-
import os
import sys
import re
from Bio import SeqIO
from Bio.Seq import Seq
in1=sys.argv[1]
out1=sys.argv[2]
out2=sys.argv[3]
ouc=open(out1,'w')
virus_dict={}
with open('.virus_nameid','r') as v:
vi=v.readlines()
for vin in vi:
vinc=vin.strip().split(" ")
vinkey=str(vinc[0])
vinvalue=vinc[1:]
vincon="_".join(vinvalue)
virus_dict[vinkey]=str(vincon)
print(str(virus_dict.keys()))
print("\n")
print(str(virus_dict))
with open(in1,'r') as i:
li=i.readlines()
for lin in li:
linc=lin.strip().split("\t")
# pos1=str(linc[3])
# pos2=str(linc[7])
matchcon=str(linc[5])
posID=str(linc[0])
if str(matchcon) == "150M":
continue
else:
if "S" not in matchcon:
if "D" in matchcon:
conpos=re.split('M|D',matchcon)
poslen=0
for conposi in conpos:
print(str(conposi))
if conposi.isdigit():
poslen=poslen+int(conposi)
elif "I" in str(conposi):
conpos1=re.split('I',matchcon)
for conpos1i in conpos1:
if conpos1i.isdigit():
continue
else:
con2=re.split('M|D',conpos1i)
for con2i in con2:
if con2i.isdigit():
poslen=poslen+int(con2i)
else:
poslen=150
else:
conpos=re.split('S',matchcon)
con1=conpos[0]
if con1.isdigit():
poslen=0
else:
con2=re.split('M|D',str(con1))
poslen=0
for conposi in con2:
if conposi.isdigit():
poslen=poslen+int(conposi)
elif "I" in str(conposi):
conpos1=re.split('I',matchcon)[1]
con2=re.split('M|D',conpos1)
for con2i in con2:
if con2i.isdigit():
poslen=poslen+int(con2i)
pos1id=linc[2]
pos1pos=linc[3]
pos2id=linc[6]
pos2pos=linc[7]
# print(str(pos2))
if "chr" in str(pos1id):
if "virus" in str(pos2id):
# posinsert=int(linc[4])
# print(str(posinsert))
posinsert=int(pos1pos)+int(poslen)
virusid=str(pos2id)
print("virusid : "+str(virusid))
virusname=virus_dict[virusid]
#ouc.write(str(linc[0])+"\t"+str(linc[2])+"\t"+str(posinsert)+"\t"+str(virusid)+"\t"+str(virusname)+"\n")
ouc.write(str(pos1id)+"\t"+str(posinsert)+"\t"+str(linc[0])+"\t"+str(virusid)+"\t"+str(virusname)+"\n")
elif "chr" in str(pos2id):
if "virus" in str(pos1id) :
posinsert=int(pos2pos)
posinsert=posinsert+int(poslen)
virusid=str(pos1id)
print("virusid : "+str(virusid))
virusname=virus_dict[virusid]
# ouc.write(str(linc[0])+"\t"+str(linc[6])+"\t"+str(posinsert)+"\t"+str(virusid)+"\t"+str(virusname)+"\n")
ouc.write(str(pos2id)+"\t"+str(posinsert)+"\t"+str(linc[0])+"\t"+str(virusid)+"\t"+str(virusname)+"\n")
ouc.close()
seqdict={}
for seqi in SeqIO.parse('./homo.fa',"fasta"):
seqid=seqi.id
seqseq=seqi.seq
seqdict[seqid]=seqseq
pos_list=[]
ou2=open(out2,'w')
count=0
with open(out1,'r') as i:
li=i.readlines()
for lin in li:
linc=lin.strip().split("\t")
print(str(linc))
count=count+1
poschr=linc[0]
pospos=linc[1]
pos_list_con=str(poschr)+"_"+str(pospos)
if pos_list_con not in pos_list:
pos_list.append(pos_list_con)
basepos=seqdict[poschr][int(pospos)]
ou2.write(str(linc[0])+"\t"+str(linc[1])+"\tINV0000000"+str(count)+"\t"+str(basepos)+"\t<INV>\t1\tPASS\tPRECISE;SVTYPE=INV;SVMETHOD=EMBL.DELLYv0.8.7;""\n")
ou2.close()
网友评论