#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import re
from Bio import SeqIO
from Bio.Seq import Seq
inputfile = sys.argv[1]
result = open("result.xls","w")
for seqi in SeqIO.parse(inputfile,'fasta'):
seqid = seqi.id
seqseq = seqi.seq
seqseq = str(seqseq)
lenghtnum = len(seqseq)
Gcont = seqseq.count('G')
Ccont = seqseq.count('C')
Tcont = seqseq.count('T')
Acont = seqseq.count('A')
GCcont = int(Gcont) + int(Ccont)
GCpre = str(float(GCcont*100/int(lenghtnum)))
GCprent = float(GCprentdup)
result.write(str(seqid)+"\t"+str(GCprent)+"\t"+str(lenghtnum)+"\n")
result.close()
# -*- coding: utf-8 -*-
# !usr/bin/python
import os
import os.path
import sys
import re
def reverseq(sequence):
rever_seq = ''
basepair = {'A':'T', 'T':'A', 'G':'C', 'C':'G','N':'N'}
sequencon = str(sequence)
for sequencei in sequence:
rever_seq = rever_seq + basepair[sequencei]
return rever_seq[::-1]
dictposidcds = {}
dictposidgene = []
os.system('touch gffinfo.txt')
gfflicon = open("gffinfo.txt","w")
genecon = open("geneinfo.txt","w")
with open("genome.gff","r") as gffliness:
gfflines = gffliness.readlines()
for gfflinev1 in gfflines:
gffline = gfflinev1.strip().split("\n")[0]
gfflin = gffline.strip().split("\t")
difftype = gfflin[2]
startpos = gfflin[3]
endpos = gfflin[4]
strand = gfflin[6]
IDpos = str(gfflin[8])
if str(difftype) == "CDS":
if IDpos in dictposidcds:
dictposidcds[IDpos].append([startpos,endpos])
elif IDpos not in dictposidcds:
dictposidcds[IDpos] = []
dictposidcds[IDpos].append([startpos,endpos])
gfflicon.write(str(startpos)+"\t"+str(endpos)+"\t"+str(strand)+"\t"+str(IDpos)+'\n')
if str(difftype) == "gene":
dictposidgene.append([int(startpos),int(endpos),str(IDpos)])
genecon.write(str(startpos)+"\t"+str(endpos)+"\t"+str(strand)+"\t"+str(IDpos)+'\n')
gfflicon.close()
genecon.close()
dictgffid = {}
gfflist = []
with open("gffinfo.txt","r") as gffliness:
gfflines = gffliness.readlines()
for gffline in gfflines:
splitgffcon = gffline.strip().split("\t")
gfflist.append(splitgffcon)
genoseq = ''
with open("keptgenome.fa","r") as fastaliness:
fastalines = fastaliness.readlines()
for fastaline in fastalines:
if fastaline.startswith(">"):
continue
else:
genoseq = str(fastaline)
dictcdslenpos = {}
idfastazheng = open("sequencezheng.txt","w")
for gfflisti in gfflist:
startpos = int(gfflisti[0])
endpos = int(gfflisti[1])
strand = gfflisti[2]
contigIDpos = gfflisti[3]
keyvaluecontig1 = str(contigIDpos).strip().split("=")[2]
keyvaluecontig = str(keyvaluecontig1).strip().split(";")[0]
if keyvaluecontig not in dictcdslenpos:
dictcdslenpos[keyvaluecontig] = []
if str(strand) == '+':
sequence = genoseq[startpos-1:endpos]
lenghtidcds = len(sequence)
dictcdslenpos[keyvaluecontig].append([int(startpos),int(endpos),str(strand),int(lenghtidcds),str(contigIDpos)])
idfastazheng.write(str(contigIDpos)+"\t"+str(startpos)+"\t"+str(endpos)+"\t"+str(strand)+"\t"+str(sequence)+"\n")
elif str(strand) == '-':
sequencev1 = str(genoseq[startpos:endpos])
lenghtidcds = len(sequencev1)
dictcdslenpos[keyvaluecontig].append([int(startpos),int(endpos),str(strand),int(lenghtidcds),str(contigIDpos)])
sequence = reverseq(sequencev1)
idfastazheng.write(str(contigIDpos)+"\t"+str(startpos)+"\t"+str(endpos)+"\t"+str(strand)+"\t"+str(sequence)+"\n")
idfastazheng.close()
dictzheseq = {}
trimfastacon = open("trimfasta.txt",'w')
with open("sequencezheng.txt","r") as zhesequenliness:
zhesequenlines = zhesequenliness.readlines()
for zhesequenline in zhesequenlines:
zhesplitzhecont = zhesequenline.strip().split("\t")
IDzheseq = zhesplitzhecont[0]
zhestartpos = zhesplitzhecont[1]
zheendpos = zhesplitzhecont[2]
zhesequence = zhesplitzhecont[4]
if IDzheseq not in dictzheseq:
dictzheseq[IDzheseq] = []
dictzheseq[IDzheseq].append([zhestartpos,zheendpos,zhesequence])
else:
dictzheseq[IDzheseq].append([zhestartpos,zheendpos,zhesequence])
for dictzhekeys,dictzhevalues in dictzheseq.items():
trimfastacon.write(">"+str(dictzhekeys)+"\n")
for dictzhevaluei in dictzhevalues:
sequence = dictzhevaluei[2]
trimfastacon.write(str(sequence))
trimfastacon.write("\n")
trimfastacon.close()
def dnatrans(dnaseq):
proteintable = {'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M', 'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T', 'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K', 'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R', 'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L', 'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P', 'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q', 'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R', 'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V', 'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A', 'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E', 'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G', 'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S', 'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L', 'TAC': 'Y', 'TAT': 'Y', 'TGC': 'C', 'TGT': 'C', 'TGG': 'W'}
endcode = ['TAA','TAG','TGA']
startsit = re.search('ATG',dnaseq)
protein = ''
codons = []
for sit in range(startsit.end(),len(dnaseq),3):
if str(dnaseq[sit:sit+3]) in proteintable.keys():
codons.append(str(dnaseq[sit:sit+3]))
elif str(dnaseq[sit:sit+3]) in endcode:
break
trimcondon[conteintrim] = codons
for codoni in codons:
transpro = proteintable[str(codoni)]
protein = protein + transpro
return protein
trimpro = open("trimprotein.txt","w")
trimcondon = {}
with open("trimfasta.txt","r") as trimseqliness:
trimseqlines = trimseqliness.readlines()
for trimseqline in trimseqlines:
contein = str(trimseqline).strip().split("\n")[0]
if str(trimseqline[0]) == ">":
trimpro.write(contein+"\n")
conteintrim = str(contein).strip().split(">")[1]
trimcondon[conteintrim] = []
elif str(trimseqline[0]) == "A" or str(trimseqline[0]) == "T" or str(trimseqline[0]) == "C" or str(trimseqline[0]) == "G":
seqprotein = dnatrans(contein)
trimpro.write(seqprotein+"\n")
trimpro.close()
proteintable = {'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M', 'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T', 'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K', 'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R', 'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L', 'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P', 'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q', 'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R', 'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V', 'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A', 'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E', 'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G', 'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S', 'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L', 'TAC': 'Y', 'TAT': 'Y', 'TGC': 'C', 'TGT': 'C', 'TGG': 'W'}
genosnp = []
possnpcon = open("resultsnppos.txt","w")
with open("genome.snp","r") as genoliness:
genolines = genoliness.readlines()
for genoline in genolines[1:]:
splitgenosnp = genoline.strip().split("\t")
chrgenosnp = splitgenosnp[0]
posgenosnp = splitgenosnp[1]
posrefsnp = splitgenosnp[2]
posrealsnp = splitgenosnp[3]
print("splitgenosnp: "+str(splitgenosnp))
for dictposidgenei in dictposidgene:
print("dictposidgenei: "+str(dictposidgenei))
startposgff = gfflisti[0]
endposgff = gfflisti[1]
contigID = gfflisti[2]
if int(posgenosnp) > startposgff and int(posgenosnp) < endposgff:
posstartsnp = startposgff
posendsnp = endposgff
searchpossgenoid1 = str(contigID).strip().split("=")[1]
searchpossgenoid = str(searchpossgenoid1).strip().split(";")[0]
condonlist = dictcdslenpos[searchpossgenoid]
lenghtid = 0
for condonlisti in condonlist:
print("condonlist: "+str(condonlisti))
cdsstartpos = condonlisti[0]
cdsendpos = condonlisti[1]
cdsstrandpos = condonlisti[2]
cdslenghtpos = int(condonlisti[3])
contigIDposgff = str(condonlisti[4])
if cdsstrandpos == "-":
if int(posgenosnp) > cdsendpos:
lenghtid = lenghtid + cdslenghtpos
elif int(posgenosnp) < int(cdsendpos) and int(posgenosnp) > int(cdsstartpos):
diststart = int(cdsendpos) - int(cdsstartpos) + 1
elif cdsstrandpos == "+":
if int(posgenosnp) > cdsendpos:
lenghtid = lenghtid + cdslenghtpos
elif int(posgenosnp) < int(cdsendpos) and int(posgenosnp) > int(cdsstartpos):
diststart = int(posgenosnp) - int(cdsstartpos) + 1
mutapos = 0
if condonnum %3 == 0:
condonnum = (lenghtid + diststart)//3
mutapos = 0
elif condonnum %3 != 0:
condonnum = (lenghtid + diststart)//3 + 1
if condonnum %3 == 1:
mutapos = 1
elif condonnum %3 == 2:
mutapos = 2
condonseq = trimcondon[contigID][condonnum]
print("condonseq: "+ condonseq)
refprotein = proteintable[str(condonseq)]
if mutapos == 0:
mutantcondon = str(posrealsnp)+str(condonseq[1])+str(condonseq[2])
elif mutapos == 1:
mutantcondon = str(condonseq[0])+str(posrealsnp)+str(condonseq[2])
elif mutapos == 2:
mutantcondon = str(condonseq[0])+str(condonseq[1])+str(posrealsnp)
mutprotein = proteintable[str(mutantcondon)]
if str(refprotein) == str(mutprotein):
syscon = "SYN"
elif str(refprotein) != str(mutprotein):
syscon = "NON-SYN"
possnpcon.write(str(contigIDposgff)+"\t"+str(chrgenosnp)+"\t"+str(posstartsnp)+"\t"+str(posendsnp)+"\t"+str(posgenosnp)+"\t"+str(posrefsnp)+"\t"+str(posrealsnp)+"\t"+str(cdsstrandpos)+"\t"+str(condonseq)+"\t"+str(mutantcondon)+"\t"+str(refprotein)+"\t"+str(mutprotein)+"\t"+str(syscon)+"\n")
网友评论