美文网首页生物信息Python
使用python处理生物信息数据(九)

使用python处理生物信息数据(九)

作者: 你猜我菜不菜 | 来源:发表于2020-03-16 20:36 被阅读0次

    Python学习的第九天,主要学习如何写自己的函数功能。


    1. 从PDB文件中提取fasta序列
    import struct
    
    pdb_format = '6s5s1s4s1s3s1s1s4s1s3s8s8s8s6s6s10s2s3s'
    
    amino_acids = {
        'ALA':'A', 'CYS':'C', 'ASP':'D', 'GLU':'E',
        'PHE':'F', 'GLY':'G', 'HIS':'H', 'LYS':'K',
        'ILE':'I', 'LEU':'L', 'MET':'M', 'ASN':'N',
        'PRO':'P', 'GLN':'Q', 'ARG':'R', 'SER':'S',
        'THR':'T', 'VAL':'V', 'TYR':'Y', 'TRP':'W'
        }
    
    def threeletter2oneletter(residues):
        for i, threeletter in enumerate(residues):
            residues[i][0] = amino_acids[threeletter[0]] 
    #这将氨基酸的三个字母的代码转换为一个字母的代码。 
    #该函数读取[res_type,chain]对的列表,
    #并用对应的单字母代码替换氨基酸三字母代码的第一个元素。
    
    def get_residues(pdb_file):
        residues = []
        for line in pdb_file:
            if line[0:4] == "ATOM":
                tmp = struct.unpack(pdb_format, line.encode('utf-8'))
                tmp_to_string = [line.decode('utf-8') for line in tmp]
                ca = tmp_to_string[3].strip()
                if ca == 'CA':
                    res_type = tmp_to_string[5].strip()
                    chain = tmp_to_string[7]
                    residues.append([res_type, chain])
        return residues
    
    def write_fasta_records(residues, pdb_id, fasta_file):
        seq = ''
        chain = residues[0][1]
        for aa, new_chain in residues:
            if new_chain == chain:
                seq = seq + aa
            else:
                fasta_file.write(">%s_%s\n%s\n" % (pdb_id, chain, seq))
                seq = aa
                chain = new_chains
        fasta_file.write(">%s_%s\n%s\n" % (pdb_id, chain, seq))
    
    def extract_sequence(pdb_id):
        pdb_file = open(pdb_id + ".pdb")
        fasta_file = open(pdb_id + ".fasta", "w")
        residues = get_residues(pdb_file)
        threeletter2oneletter(residues)
        write_fasta_records(residues, pdb_id, fasta_file)
        pdb_file.close()
        fasta_file.close()
        
    extract_sequence("3G5U")
    

    2. 写一个极简单的函数
    def calc_sum(num1, num2):
        '''一个名为calc_sum函数,定义函数功能,对两个参数求和,然后返回出结果'''
        result = num1 + num2
        return result
        
    calc_sum(23,7)  #calc_sum计算23和7的和
    Out[6]: 30
    
    ###直接返回函数运行后的结果
    def increment(number):
        '''返回给定的数字加1的结果'''
        return number + 1
        
    increment(5) #给定5,返回值为6
    Out[8]: 6
    
    ###直接打印出给定的参数
    def print_arg(number):
        print(number)
        
    print_arg(5) #输入5,打印出5
    5
    
    ###上述两个函数的套用
    print_arg(increment(5)) #先是increment(5)函数,对5进行加1操作,然后print_arg()直接打印出increment(5)函数运行后的结果
    
    

    3. 使用struct模块将字符串转为元组
    import struct
    
    format = "1s2s1s1s" #1s表示一个字符为一个单位, 2s则表示两个字符为一个单位
    
    line = "12345"
    
    col = struct.unpack(format, line.encode("utf-8") #将line这个字符串以format的格式拆分开
    )
    
    col_to_string = [line.decode("utf-8") for line in col]
    
    print(col)
    (b'1', b'23', b'4', b'5')
    
    print(col_to_string)
    ['1', '23', '4', '5']
    
    ###calcsize()函数这个是统计format格式中有多少个字符
    format = '30s30s20s1s'
    
    print(struct.calcsize(format))
    81
    

    4. 将任意数量的参数转化为tab符分隔的字符串
    def convert_to_string(*args):
        result = [str(a) for a in args]
        return "\t".join(result) + "\n"
    
    output_file = open("nucleotideSubstitMatrix.txt", "w")
    
    output_file.write(convert_to_string('', 'A', 'T', 'C', 'G'))
    Out[17]: 9
    
    output_file.write(convert_to_string('A', 1.0))
    Out[18]: 6
    
    output_file.write(convert_to_string('T', 0.5, 1.0))
    Out[19]: 10
    
    output_file.write(convert_to_string('C', 0.1, 0.1, 1.0))
    Out[20]: 14
    
    output_file.write(convert_to_string('G', 0.1, 0.1, 0.5, 1.0))
    Out[21]: 18
    
    output_file.close()
    

    5. 提取PDB文件中特定的残基
    import struct
    
    from parse_pdb import parse_atom_line
    
    def main(pdb_filename, residues, output_filename):
        pdb = open(pdb_filename)
        outfile = open(output_filename, "w")
        for line in pdb:
            if line.startswith("ATOM"):
                chain, res_type, res_num, atom, x, y, z = parse_atom_line(line)
                for aa, num in residues:
                    if res_type == aa and res_num == num:
                        outfile.write(line)
        outfile.close()
    
    residues = [('ASP', '102'), ('HIS', '57'), ('SER', '195')]
    main("1TLD.pdb", residues, "trypsin_triad.pdb")
    

    6. 计算两个原子之间的距离
    #计算PDB文件中两个原子间的距离
    from math import sqrt
    from distance import calc_dist
    from parse_pdb import parse_atom_line
    
    pdb = open('3G5U.pdb')
    points = []
    
    while len(points) < 2:
        line = pdb.readline()
        if line.startswith("ATOM"):
            chain, res_type, res_num, atom, x, y, z = parse_atom_line(line)
            if res_num == '123' and chain == 'A' and atom == 'CA':
                points.append((x, y, z))
            if res_num == '209' and chain == 'A' and atom == 'CA':
                points.append((x, y, z))
    
    
    print(calc_dist(points[0], points[1]))
    35.219262627147664
    
    ####计算PDB中所有CA原子的距离
    from math import sqrt
    from distance import calc_dist
    from parse_pdb import parse_atom_line
    
    def get_ca_atoms(pdb_filename):
        pdb_file = open(pdb_filename, "r") #提取ca原子的数据列表
        ca_list = []
        for line in pdb_file:
            if line.startswith('ATOM'):
                data = parse_atom_line(line)
                chain, res_type, res_num, atom, x, y, z = data
                if atom == 'CA' and chain == 'A': 
                    ca_list.append(data)
        pdb_file.close()
        return ca_list
    
    ca_atoms = get_ca_atoms("1TLD.pdb")
    for i, atom1 in enumerate(ca_atoms):
        # save coordinates in a variable
        name1 = atom1[1] + atom1[2]
        coord1 = atom1[4:]
        # compare atom1 with all other atoms
        for j in range(i+1, len(ca_atoms)):
            atom2 = ca_atoms[j]
            name2 = atom2[1] + atom2[2]
            coord2 = atom2[4:]
            # calculate the distance between atoms
            dist = calc_dist(coord1, coord2)
            print(name1, name2, dist)
    
    

    相关文章

      网友评论

        本文标题:使用python处理生物信息数据(九)

        本文链接:https://www.haomeiwen.com/subject/lcfadhtx.html