美文网首页
2021-02-22

2021-02-22

作者: rong酱 | 来源:发表于2021-02-22 23:13 被阅读0次
# -*- coding: utf-8 -*-
import re

# cds位置信息存入列表 #
def del_GFF(file): #file是GFF文件
    GFF_list = []
    with open(file) as file_liness: 
        file_lines = file_liness.readlines()
        for file_line in file_lines:
            file_lin = file_line.strip().split("\t")
            chr_num = file_lin[0]
            change = file_lin[2]
            if change == "gene":
                startpoint = file_lin[3]
                endpoint = file_lin[4]
                ID_nums_mer = file_lin[8]
                ID_nums = ID_nums_mer.strip().split(";")
                ID_num = ID_nums[0]
                GFF_list.append([ID_num,change,chr_num,startpoint,endpoint])
    return GFF_list
# GFF_List [ID号 ,mRNA /CDS/gene , 染色体号,起始位点,终止位点]

# 处理fasta文件 # , 形成字典。 键值是ContigID, value值是 序列
def del_fasta(file): #file 是fasta文件
    fasta_dict = {}
    with open(file) as fa_liness:
        fa_lines = fa_liness.readlines()
        for lines_fa in fa_lines:
            line_fa = lines_fa.strip()
            if line_fa == '':
                continue
            if line_fa.startswith('>'):
                seqname = line_fa.lstrip('>')
                seqname = re.sub('\..*','',seqname)
            else:
                fasta_dict[seqname] = line_fa
    return fasta_dict
 
def trim_fa(GFF_list,fasta_dict): # 根据 GFF文件的位置信息,提取的序列,存入字典
    trim_cds = {}
    for GFFi in GFF_list:
        GFFi_chrnum = GFFi[2]
        print("GFFi_chrnum")
        print(GFFi_chrnum)
        GFFi_startpoint = int(GFFi[3])-1
        GFFi_endpoint = int(GFFi[4])-1
        GFFi_ID = GFFi[0]
        GFFi_sequen = fasta_dict[GFFi_chrnum][GFFi_startpoint:GFFi_endpoint]
        trim_cds[GFFi_ID] = GFFi_sequen
    return trim_cds  # 

# DNA序列 转换为 mRNA序列
def rever_comp(trim_cds):
    rever_RNA = {}
    basepair = {'A':'U', 'T':'A', 'G':'C', 'C':'G','N':'N'}
    for DNAi_keys,DNAi_value in trim_cds.items():
        rever_seq = ''
        for DNAseqi_value in DNAi_value:
            rever_RNAseq = rever_seq + basepair[DNAseqi_value]
        rever_RNA[DNAi_keys] = rever_RNAseq
    return rever_RNA  # rever_RNA 字典的形式,ID为键值,RNA序列为value值。

def code_tran(rever_RNA): # 3个字母 # RNA序列转换氨基酸序列 , 每个基因序列转换为对应的氨基酸序列,rever_RNA是字典的形式,contig ID 为键值,序列为VALUE值。 
    codon_table = {'GCU':'A','GCC':'A','GCA':'A','GCG':'A',
                'CGU':'R','CGC':'R','CGA':'R','CGG':'R','AGA':'R','AGG':'R',
                'UCU':'S','UCC':'S','UCA':'S','UCG':'S','AGU':'S','AGC':'S',
                'AUU':'I','AUC':'I','AUA':'I','AUU':'I','AUC':'I','AUA':'I',
                'UUA':'L','UUG':'L','CUU':'L','CUC':'L','CUA':'L','CUG':'L',
                'GGU':'G','GGC':'G','GGA':'G','GGG':'G',
                'GUU':'V','GUC':'V','GUA':'V','GUG':'V',
                'ACU':'T','ACC':'T','ACA':'T','ACG':'T',
                'CCU':'P','CCC':'P','CCA':'P','CCG':'P',
                'AAU':'N','AAC':'N',
                'GAU':'D','GAC':'D',
                'UGU':'C','UGC':'C',
                'CAA':'Q','CAG':'Q',
                'GAA':'E','GAG':'E',
                'CAU':'H','CAC':'H',
                'AAA':'K','AAG':'K',
                'UUU':'F','UUC':'F',
                'UAU':'Y','UAC':'Y',
                'AUG':'M',
                'UGG':'W',
                'AUG': 'START',
                'UAG':'STOP','UGA':'STOP','UAA':'STOP'}
    protein = ''
    for rever_RNA_keys,rever_RNA_value in rever_RNA.items():
        lenght_RNA = len(rever_RNA_value)
        for RNAi in range(0,lenght_RNA,3):
            startpoint = RNAi
            endpoint = RNAi+3
            print("startpoint"+str(startpoint))
            print("endpoint"+str(endpoint))
            condon = rever_RNA_value[startpoint:endpoint]
            print("condon")
            print(condon)
            protein = protein + codon_table[condon]
    return protein

if __name__ == "__main__":
    geno_file = "genome.gff"
    GFF_list = del_GFF(geno_file)
    fasta_file = "genome.fa"
    fasta_dict = del_fasta(fasta_file)
    trim_seq = trim_fa(GFF_list,fasta_dict)
    RNA_seq = rever_comp(trim_seq)
    Dict_trim_seq = code_tran(RNA_seq)

相关文章

网友评论

      本文标题:2021-02-22

      本文链接:https://www.haomeiwen.com/subject/vtslfltx.html