美文网首页
2021-03-20

2021-03-20

作者: rong酱 | 来源:发表于2021-03-20 11:23 被阅读0次
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import sys
import re
from Bio import SeqIO
from Bio.Seq import Seq

inputfile = sys.argv[1]
result = open("result.xls","w")
for seqi in SeqIO.parse(inputfile,'fasta'):
    seqid = seqi.id
    seqseq = seqi.seq
    seqseq = str(seqseq)
    lenghtnum = len(seqseq)
    Gcont = seqseq.count('G')
    Ccont = seqseq.count('C')
    Tcont = seqseq.count('T')
    Acont = seqseq.count('A')
    GCcont = int(Gcont) + int(Ccont)
    GCpre = str(float(GCcont*100/int(lenghtnum)))
    GCprent = float(GCprentdup)
    result.write(str(seqid)+"\t"+str(GCprent)+"\t"+str(lenghtnum)+"\n")
result.close()


# -*- coding: utf-8 -*-
# !usr/bin/python

import os
import os.path
import sys
import re

def reverseq(sequence):
    rever_seq = ''
    basepair = {'A':'T', 'T':'A', 'G':'C', 'C':'G','N':'N'}
    sequencon = str(sequence)
    for sequencei in sequence:
        rever_seq = rever_seq + basepair[sequencei]
    return rever_seq[::-1]

dictposidcds = {}
dictposidgene = []
os.system('touch gffinfo.txt')
gfflicon = open("gffinfo.txt","w")
genecon = open("geneinfo.txt","w")
with open("genome.gff","r") as gffliness:
    gfflines = gffliness.readlines()
    for gfflinev1 in gfflines:
        gffline = gfflinev1.strip().split("\n")[0]
        gfflin = gffline.strip().split("\t")
        difftype = gfflin[2]
        startpos = gfflin[3]
        endpos = gfflin[4]
        strand = gfflin[6]
        IDpos = str(gfflin[8])
        if str(difftype) == "CDS":
            if IDpos in dictposidcds:
                dictposidcds[IDpos].append([startpos,endpos])
            elif IDpos not in dictposidcds:
                dictposidcds[IDpos] = []
                dictposidcds[IDpos].append([startpos,endpos])
            gfflicon.write(str(startpos)+"\t"+str(endpos)+"\t"+str(strand)+"\t"+str(IDpos)+'\n')
        if str(difftype) == "gene":
            dictposidgene.append([int(startpos),int(endpos),str(IDpos)])
            genecon.write(str(startpos)+"\t"+str(endpos)+"\t"+str(strand)+"\t"+str(IDpos)+'\n')
gfflicon.close()
genecon.close()
dictgffid = {}
gfflist = []
with open("gffinfo.txt","r") as gffliness:
    gfflines = gffliness.readlines()
    for gffline in gfflines:
        splitgffcon = gffline.strip().split("\t")
        gfflist.append(splitgffcon)

genoseq = ''
with open("keptgenome.fa","r") as fastaliness:
    fastalines = fastaliness.readlines()
    for fastaline in fastalines:
        if fastaline.startswith(">"):
            continue
        else:
            genoseq = str(fastaline)

dictcdslenpos = {}
idfastazheng = open("sequencezheng.txt","w")
for gfflisti in gfflist:
    startpos = int(gfflisti[0])
    endpos = int(gfflisti[1])
    strand = gfflisti[2]
    contigIDpos = gfflisti[3]
    keyvaluecontig1 = str(contigIDpos).strip().split("=")[2]
    keyvaluecontig = str(keyvaluecontig1).strip().split(";")[0]
    if keyvaluecontig not in dictcdslenpos:
        dictcdslenpos[keyvaluecontig] = []
    if str(strand) == '+':
        sequence = genoseq[startpos-1:endpos]
        lenghtidcds = len(sequence)
        dictcdslenpos[keyvaluecontig].append([int(startpos),int(endpos),str(strand),int(lenghtidcds),str(contigIDpos)])
        idfastazheng.write(str(contigIDpos)+"\t"+str(startpos)+"\t"+str(endpos)+"\t"+str(strand)+"\t"+str(sequence)+"\n")
    elif str(strand) == '-':
        sequencev1 = str(genoseq[startpos:endpos])
        lenghtidcds = len(sequencev1)
        dictcdslenpos[keyvaluecontig].append([int(startpos),int(endpos),str(strand),int(lenghtidcds),str(contigIDpos)])
        sequence = reverseq(sequencev1)
        idfastazheng.write(str(contigIDpos)+"\t"+str(startpos)+"\t"+str(endpos)+"\t"+str(strand)+"\t"+str(sequence)+"\n")    
idfastazheng.close()

dictzheseq = {}
trimfastacon = open("trimfasta.txt",'w')
with open("sequencezheng.txt","r") as zhesequenliness:
    zhesequenlines = zhesequenliness.readlines()
    for zhesequenline in zhesequenlines:
        zhesplitzhecont = zhesequenline.strip().split("\t")
        IDzheseq = zhesplitzhecont[0]
        zhestartpos = zhesplitzhecont[1]
        zheendpos = zhesplitzhecont[2]
        zhesequence = zhesplitzhecont[4]
        if IDzheseq not in dictzheseq:
            dictzheseq[IDzheseq] = []
            dictzheseq[IDzheseq].append([zhestartpos,zheendpos,zhesequence])
        else:
            dictzheseq[IDzheseq].append([zhestartpos,zheendpos,zhesequence])
    for dictzhekeys,dictzhevalues in dictzheseq.items():
        trimfastacon.write(">"+str(dictzhekeys)+"\n")
        for dictzhevaluei in dictzhevalues:
            sequence = dictzhevaluei[2]
            trimfastacon.write(str(sequence))
        trimfastacon.write("\n")
trimfastacon.close()

def dnatrans(dnaseq):
    proteintable = {'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M', 'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T', 'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K', 'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R', 'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L', 'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P', 'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q', 'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R', 'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V', 'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A', 'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E', 'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G', 'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S', 'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L', 'TAC': 'Y', 'TAT': 'Y', 'TGC': 'C', 'TGT': 'C', 'TGG': 'W'}
    endcode = ['TAA','TAG','TGA']
    startsit = re.search('ATG',dnaseq)
    protein = ''
    codons = []
    for sit in range(startsit.end(),len(dnaseq),3):
        if str(dnaseq[sit:sit+3]) in proteintable.keys():
            codons.append(str(dnaseq[sit:sit+3]))
        elif str(dnaseq[sit:sit+3]) in endcode:
            break
    trimcondon[conteintrim] = codons
    for codoni in codons:
        transpro = proteintable[str(codoni)]
        protein = protein + transpro
    return protein

trimpro = open("trimprotein.txt","w")
trimcondon = {}
with open("trimfasta.txt","r") as trimseqliness:
    trimseqlines = trimseqliness.readlines()
    for trimseqline in trimseqlines:
        contein = str(trimseqline).strip().split("\n")[0]
        if str(trimseqline[0]) == ">":
            trimpro.write(contein+"\n")
            conteintrim = str(contein).strip().split(">")[1]
            trimcondon[conteintrim] = []
        elif str(trimseqline[0]) == "A" or str(trimseqline[0]) == "T" or str(trimseqline[0]) == "C" or str(trimseqline[0]) == "G":
            seqprotein = dnatrans(contein)
            trimpro.write(seqprotein+"\n")
trimpro.close()

proteintable = {'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M', 'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T', 'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K', 'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R', 'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L', 'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P', 'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q', 'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R', 'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V', 'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A', 'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E', 'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G', 'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S', 'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L', 'TAC': 'Y', 'TAT': 'Y', 'TGC': 'C', 'TGT': 'C', 'TGG': 'W'}

genosnp = []
possnpcon = open("resultsnppos.txt","w")
with open("genome.snp","r") as genoliness:
    genolines = genoliness.readlines()
    for genoline in genolines[1:]:
        splitgenosnp = genoline.strip().split("\t")
        chrgenosnp = splitgenosnp[0]
        posgenosnp = splitgenosnp[1]
        posrefsnp = splitgenosnp[2]
        posrealsnp = splitgenosnp[3]
        print("splitgenosnp: "+str(splitgenosnp))
        for dictposidgenei in dictposidgene:
            print("dictposidgenei: "+str(dictposidgenei))
            startposgff = gfflisti[0]
            endposgff = gfflisti[1]
            contigID = gfflisti[2]
            if int(posgenosnp) > startposgff and int(posgenosnp) < endposgff:
                posstartsnp = startposgff
                posendsnp = endposgff
                searchpossgenoid1 = str(contigID).strip().split("=")[1]
                searchpossgenoid = str(searchpossgenoid1).strip().split(";")[0]
                condonlist = dictcdslenpos[searchpossgenoid]    
                lenghtid = 0
                for condonlisti in condonlist:
                    print("condonlist: "+str(condonlisti))
                    cdsstartpos = condonlisti[0]
                    cdsendpos = condonlisti[1]
                    cdsstrandpos = condonlisti[2]
                    cdslenghtpos = int(condonlisti[3])
                    contigIDposgff = str(condonlisti[4])              
                    if cdsstrandpos == "-":
                        if int(posgenosnp) > cdsendpos:
                            lenghtid = lenghtid + cdslenghtpos
                        elif int(posgenosnp) < int(cdsendpos) and int(posgenosnp) > int(cdsstartpos):
                            diststart = int(cdsendpos) - int(cdsstartpos) +  1
                    elif cdsstrandpos == "+":
                        if int(posgenosnp) > cdsendpos:
                            lenghtid = lenghtid + cdslenghtpos
                        elif int(posgenosnp) < int(cdsendpos) and int(posgenosnp) > int(cdsstartpos):
                            diststart = int(posgenosnp) - int(cdsstartpos) +  1
                mutapos = 0
                if condonnum %3 == 0:
                    condonnum = (lenghtid + diststart)//3
                    mutapos = 0
                elif condonnum %3 != 0:
                    condonnum = (lenghtid + diststart)//3 + 1
                    if condonnum %3 == 1:
                        mutapos = 1
                    elif condonnum %3 == 2:
                        mutapos = 2
                condonseq = trimcondon[contigID][condonnum]
                print("condonseq: "+ condonseq)
                refprotein = proteintable[str(condonseq)]
                if mutapos == 0:
                    mutantcondon = str(posrealsnp)+str(condonseq[1])+str(condonseq[2])
                elif mutapos == 1:
                    mutantcondon = str(condonseq[0])+str(posrealsnp)+str(condonseq[2])
                elif mutapos == 2:
                    mutantcondon = str(condonseq[0])+str(condonseq[1])+str(posrealsnp)
                mutprotein = proteintable[str(mutantcondon)]
                if str(refprotein) == str(mutprotein):
                    syscon = "SYN"
                elif str(refprotein) != str(mutprotein):
                    syscon = "NON-SYN"
                possnpcon.write(str(contigIDposgff)+"\t"+str(chrgenosnp)+"\t"+str(posstartsnp)+"\t"+str(posendsnp)+"\t"+str(posgenosnp)+"\t"+str(posrefsnp)+"\t"+str(posrealsnp)+"\t"+str(cdsstrandpos)+"\t"+str(condonseq)+"\t"+str(mutantcondon)+"\t"+str(refprotein)+"\t"+str(mutprotein)+"\t"+str(syscon)+"\n")

相关文章

  • 春风吹野菜生

    Nature's Springtime Gifts 2021-03-20 276词 四级生活 When sprin...

  • 前端面试每日 3+1 —— 第704天

    今天的知识点 (2021-03-20) —— 第704天 (我也要出题[http://www.h-camel.co...

  • 品茗茶

    品茗茶 2021-03-20 22:14:39 0 ————《品茗茶》———— 平水/五律 平起/下平六麻 ———...

  • 周六 2021-03-20 23:00 - 07:30 阴 05

    周六 2021-03-20 23:00 - 07:30 阴 05h06m 记录闪现的灵感(inspirations...

  • bitshares比特股数据20210320

    2021-03-20比特股BTS大额转账的记录 时间转出转入BTS数量11:33:51zbbts001zbsend...

  • 追求好睡眠

    2021-03-20精进第60天 | 没有记录就没有发生 2021年的事业梦想个人目标描述:达成MDRT业绩目标,...

  • 能量本无事,低频自扰之

    我怎么如此幸运-99挑战赛14-重生204-戴红霞(2021-03-20) 我怎么如此幸运-能量本无事,低频自扰之...

  • 果然复盘24❤思维的格局

    果然复盘24 2021-03-20 1今日收获 ①心动就行动,心想事成。当我把行动力提升能量果然增强。 ②听小鱼穆...

  • 2021-05-09

    2021-03-20 - 草稿 不知道该写✍什么,又感觉很多想些出来的东西。是自己实力不够吧,无法用笔杆抒发心情。...

  • 阅读《56号教室的奇迹》

    幸福日志2021-03-20 周六 阴 已经春分了,可是总觉得这个冬天好长好长,感觉要冻死在春天里,只有路边的花草...

网友评论

      本文标题:2021-03-20

      本文链接:https://www.haomeiwen.com/subject/fftwcltx.html