from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import gffutils
import re
#######################
#从cds获得每条转录本的长度,从gff获得每个基因包含哪些转录本
fout = 'output/longest.fa'
cds = 'data/rna.fna'
gff = 'data/genomic.gff'
fout = 'output/longest.fa'
seq_index = SeqIO.index(cds ,'fasta')
gffdb = gffutils.create_db(gff,'gff.db'.force=True,
merge_strategy='creat_unique')
#以基因为key,转录本为value的字典
gene2longestcds = {}
#对基因进行迭代
for g in gffdb.all_features(featuretype='gene'):
g_id= g.id
for m in gffdb.children(g, featuretype='mRNA'):##只关注mRNA信息
m_id = m.id.replace('rna-' , '')
m_len = len(seq_index[m_id].seq)
#一个基因中最长转录本的名字
if g_id not in gene2longestcds:
gene2longestcds[g_id] = m_id
elif m_len >len(seq_index[gene2longestcds[g_id]].seq):
gene2longestcds[g_id] = m_id
sr_list = [v for k,v in seq_index.items() if k in gene2longestcds.values()]
##输出
SeqIO.write(sr_list, fout, 'fasta')
网友评论