# -*- coding: utf-8 -*-
import re
# cds位置信息存入列表 #
def del_GFF(file): #file是GFF文件
GFF_list = []
with open(file) as file_liness:
file_lines = file_liness.readlines()
for file_line in file_lines:
file_lin = file_line.strip().split("\t")
chr_num = file_lin[0]
change = file_lin[2]
if change == "gene":
startpoint = file_lin[3]
endpoint = file_lin[4]
ID_nums_mer = file_lin[8]
ID_nums = ID_nums_mer.strip().split(";")
ID_num = ID_nums[0]
GFF_list.append([ID_num,change,chr_num,startpoint,endpoint])
return GFF_list
# GFF_List [ID号 ,mRNA /CDS/gene , 染色体号,起始位点,终止位点]
# 处理fasta文件 # , 形成字典。 键值是ContigID, value值是 序列
def del_fasta(file): #file 是fasta文件
fasta_dict = {}
with open(file) as fa_liness:
fa_lines = fa_liness.readlines()
for lines_fa in fa_lines:
line_fa = lines_fa.strip()
if line_fa == '':
continue
if line_fa.startswith('>'):
seqname = line_fa.lstrip('>')
seqname = re.sub('\..*','',seqname)
else:
fasta_dict[seqname] = line_fa
return fasta_dict
def trim_fa(GFF_list,fasta_dict): # 根据 GFF文件的位置信息,提取的序列,存入字典
trim_cds = {}
for GFFi in GFF_list:
GFFi_chrnum = GFFi[2]
print("GFFi_chrnum")
print(GFFi_chrnum)
GFFi_startpoint = int(GFFi[3])-1
GFFi_endpoint = int(GFFi[4])-1
GFFi_ID = GFFi[0]
GFFi_sequen = fasta_dict[GFFi_chrnum][GFFi_startpoint:GFFi_endpoint]
trim_cds[GFFi_ID] = GFFi_sequen
return trim_cds #
# DNA序列 转换为 mRNA序列
def rever_comp(trim_cds):
rever_RNA = {}
basepair = {'A':'U', 'T':'A', 'G':'C', 'C':'G','N':'N'}
for DNAi_keys,DNAi_value in trim_cds.items():
rever_seq = ''
for DNAseqi_value in DNAi_value:
rever_RNAseq = rever_seq + basepair[DNAseqi_value]
rever_RNA[DNAi_keys] = rever_RNAseq
return rever_RNA # rever_RNA 字典的形式,ID为键值,RNA序列为value值。
def code_tran(rever_RNA): # 3个字母 # RNA序列转换氨基酸序列 , 每个基因序列转换为对应的氨基酸序列,rever_RNA是字典的形式,contig ID 为键值,序列为VALUE值。
codon_table = {'GCU':'A','GCC':'A','GCA':'A','GCG':'A',
'CGU':'R','CGC':'R','CGA':'R','CGG':'R','AGA':'R','AGG':'R',
'UCU':'S','UCC':'S','UCA':'S','UCG':'S','AGU':'S','AGC':'S',
'AUU':'I','AUC':'I','AUA':'I','AUU':'I','AUC':'I','AUA':'I',
'UUA':'L','UUG':'L','CUU':'L','CUC':'L','CUA':'L','CUG':'L',
'GGU':'G','GGC':'G','GGA':'G','GGG':'G',
'GUU':'V','GUC':'V','GUA':'V','GUG':'V',
'ACU':'T','ACC':'T','ACA':'T','ACG':'T',
'CCU':'P','CCC':'P','CCA':'P','CCG':'P',
'AAU':'N','AAC':'N',
'GAU':'D','GAC':'D',
'UGU':'C','UGC':'C',
'CAA':'Q','CAG':'Q',
'GAA':'E','GAG':'E',
'CAU':'H','CAC':'H',
'AAA':'K','AAG':'K',
'UUU':'F','UUC':'F',
'UAU':'Y','UAC':'Y',
'AUG':'M',
'UGG':'W',
'AUG': 'START',
'UAG':'STOP','UGA':'STOP','UAA':'STOP'}
protein = ''
for rever_RNA_keys,rever_RNA_value in rever_RNA.items():
lenght_RNA = len(rever_RNA_value)
for RNAi in range(0,lenght_RNA,3):
startpoint = RNAi
endpoint = RNAi+3
print("startpoint"+str(startpoint))
print("endpoint"+str(endpoint))
condon = rever_RNA_value[startpoint:endpoint]
print("condon")
print(condon)
protein = protein + codon_table[condon]
return protein
if __name__ == "__main__":
geno_file = "genome.gff"
GFF_list = del_GFF(geno_file)
fasta_file = "genome.fa"
fasta_dict = del_fasta(fasta_file)
trim_seq = trim_fa(GFF_list,fasta_dict)
RNA_seq = rever_comp(trim_seq)
Dict_trim_seq = code_tran(RNA_seq)
网友评论