美文网首页
rosalind1-10

rosalind1-10

作者: dmmy大印 | 来源:发表于2018-08-16 11:21 被阅读0次

    1.counting DNA nucleotides

    rosalind1 题目
    #style1
    seq=''
    ntcount=[]
    with open('rosalind1.txt') as f:
        for line in f:
             line = line.rstrip() #去末尾空格
             seq += line.upper()
    for nt in ['A','C','G','T']:
        ntcount.append(seq.count(nt))
    print('\t'.join(map(str,ntcount)))
    
    #style2
    with open('rosalind1.txt') as f:
        for a in f:
            b = list(str(a)) #b是一个个核苷酸的列表
           c={}
           for d in b:
               c[d] = b.count(d)
           print(c)
    
    #style3
    from collections import Counter
    with open('rosalind_dna.txt') as f:
        list=f.read().rstrip()
        print(Counter(list))
    

    2.将DNA转录成RNA

    Rosalind 2 题目
    #style1 使用replace函数
    with open('rosalind_rna.txt','r') as f:
        list=f.read()
        print(list.replace('T','U'))
    
    #style2 使用re.sub正则表达式
    import re
    with open('rosalind_rna.txt', 'r') as f:
        list = f.read()
        print(re.sub('T','U',list))
    

    3.获取反向互补链 Rosalind 3 题目

    #rosalind 3
    #style1
    def reverse_complement(seq):
        ntComplement={'A':'T','C':'G','G':'C','T':'A'}
        revSeqList = list(reversed(seq))
        revComSeqList = [ntComplement[k] for k in revSeqList]
        revComSeq = ''.join(revComSeqList)
        return revComSeq
    
    seq=''
    with open('rosalind_revc.txt') as f:
        for line in f:
            line = line.rstrip()
            seq += line.rstrip()
            print(reverse_complement(seq))
    
    #style2 使用翻译表
    seq=''
    with open('rosalind_revc.txt') as f:
        for line in f:
            line = line.rstrip()
            seq += line.rstrip()
    transtable = str.maketrans('ATCG','TAGC')
    print(seq.translate(transtable)[::-1])
    
    #style3 使用biopython
    from Bio.Seq import Seq
    from Bio.Alphabet import IUPAC
    
    seq=''
    with open('rosalind_revc.txt') as f:
        for line in f:
            line = line.rstrip()
            seq += line.rstrip()
    
    my_seq = Seq(seq,IUPAC.unambiguous_dna)
    print(my_seq)
    print(my_seq.complement()) #互补链
    print(my_seq.reverse_complement()) #反向互补链
    

    4.兔子和递归关系

    rosalind4 题目
    def fibonacciRabbits(n, k):
        F = [1, 1]
        generation = 2
        while generation <= n:
            F.append(F[generation - 1] + F[generation - 2] * k)
            generation += 1
        # 函数返回列表最后一项
        return (F[n - 1])
    
    #调用函数并打印
    print (fibonacciRabbits(5, 3))
    

    5.计算GC含量

    rosalind 5题目
    #style1
    from Bio import SeqIO
    Target_Seq = SeqIO.parse('rosalind_gc.txt','fasta')
    GC_content = 0
    ID = ''
    for target in Target_Seq:
        if GC_content < (float(target.seq.count('C')+target.seq.count('G'))/len(target.seq))*100:
            GC_content = (float(target.seq.count('C')+target.seq.count('G'))/len(target.seq))*100
            ID = target.description
    print(ID,'\n',GC_content)
    
    #style2
    from Bio.Seq import Seq
    from Bio.Alphabet import IUPAC
    from Bio.SeqUtils import GC
    my_seq = Seq("GGGATCGTAC",IUPAC.unambiguous_dna)
    for index,letter in enumerate(my_seq):
        print(index,letter)
    print(len(my_seq))
    print(my_seq.count('A'))
    print(GC(my_seq))
    

    6.计算突变个数

    Rosalind6题目
    d = open('rosalind_hamm.txt','r')
    s = d.readlines()
    a = s[0].strip()
    b = s[1].strip()
    hum = 0
    for i in range(len(a)):
        if a[i] != b[I]:
            hum+=1
    print(hum)
    

    7.孟德尔第一定律 Rosalind7题目

    from scipy.misc import comb
    def fun(k,m,n):
        y = k+m+n
        nn = comb(n,2)/comb(y,2)
        kk = comb(m,2)/comb(y,2)
        mm = comb(n,1)/comb(y,2)
        p = round(1-(nn+kk*1/4+mm*1/2),5)
        return p
    
    #读取数字
    d = open('rosalind_iprb.txt','r')
    s = d.read()
    f = s.split()
    k,m,n = int(f[0]),int(f[1]),int(f[2])
    print(fun(k,m,n))
    

    8.将RNA翻译成蛋白质

    rosalind8 题目
    # -*- coding: utf-8 -*-
    '''rosaline 将RNA翻译成蛋白质'''
    #style1 使用biopython
    from Bio import Seq
    from Bio.Alphabet import generic_dna,generic_rna
    from Bio import SeqIO
    from Bio.Data import CodonTable
    #载入mRNA序列
    RNA_string=open('rosalind_prot.txt','r').read().strip()
    my_seq=Seq.Seq(RNA_string,generic_rna)
    #载入标准遗传密码表
    standard_table=CodonTable.unambiguous_rna_by_name['Standard']
    #将mRNA翻译成蛋白质
    protein=my_seq.translate(table='Standard')
    print(protein)
    
    #style2 自己写出密码表
    import re
    def mRNA_protein(RNA_string):
        #将mRNA翻译成蛋白质
        start_code = 'AUG'
        end_code = ['UAA', 'UAG', 'UGA']
        protein_table = {'UUU': 'F', 'CUU': 'L', 'AUU': 'I', 'GUU': 'V', \
                         'UUC': 'F', 'CUC': 'L', 'AUC': 'I', 'GUC': 'V', \
                         'UUA': 'L', 'CUA': 'L', 'AUA': 'I', 'GUA': 'V', \
                         'UUG': 'L', 'CUG': 'L', 'AUG': 'M', 'GUG': 'V', \
                         'UCU': 'S', 'CCU': 'P', 'ACU': 'T', 'GCU': 'A', \
                         'UCC': 'S', 'CCC': 'P', 'ACC': 'T', 'GCC': 'A', \
                         'UCA': 'S', 'CCA': 'P', 'ACA': 'T', 'GCA': 'A', \
                         'UCG': 'S', 'CCG': 'P', 'ACG': 'T', 'GCG': 'A', \
                         'UAU': 'Y', 'CAU': 'H', 'AAU': 'N', 'GAU': 'D', \
                         'UAC': 'Y', 'CAC': 'H', 'AAC': 'N', 'GAC': 'D', \
                         'UAA': 'Stop', 'CAA': 'Q', 'AAA': 'K', 'GAA': 'E', \
                         'UAG': 'Stop', 'CAG': 'Q', 'AAG': 'K', 'GAG': 'E', \
                         'UGU': 'C', 'CGU': 'R', 'AGU': 'S', 'GGU': 'G', \
                         'UGC': 'C', 'CGC': 'R', 'AGC': 'S', 'GGC': 'G', \
                         'UGA': 'Stop', 'CGA': 'R', 'AGA': 'R', 'GGA': 'G', \
                         'UGG': 'W', 'CGG': 'R', 'AGG': 'R', 'GGG': 'G'
                         }
        # 找到起始密码子的位置
        start_sit = re.search(start_code, RNA_string)
        protein = ''
        # 按阅读框匹配蛋白质
        for sit in range(start_sit.end(), len(RNA_string), 3):
            protein = protein + protein_table[RNA_string[sit:sit + 3]]
        print(protein)
    
    if __name__ == '__main__':
        RNA_string = open('rosalind_prot.txt','r').read().strip()
        mRNA_protein(RNA_string)
    
    #style3 使用biopython
    from Bio.Seq import Seq
    from Bio.Alphabet import IUPAC
    d=open('rosalind_prot.txt','r').read().strip()
    messenger_rna=Seq(d,IUPAC.unambiguous_rna)
    print(messenger_rna.translate().strip('*'))
    

    9.找到子串位置

    rosalind9 题目
    # -*- coding: utf-8 -*-
    #rosalind9:finding a motif in dna
    #style1
    d=open('rosalind_subs.txt','r')
    s=d.readlines()   #返回列表
    a=s[0].strip()
    b=s[1].strip()
    print(s)
    num1,num2=len(a),len(b)
    n=num1-num2
    I=0
    site=[]
    for i in range(n):
        c=a[i:i+num2]
        if b==c:
            site.append(i+1)#python的第1位是0,第2位是1,所以要加1
    
    print(" ".join(str(j) for j in site))#输出成sampleoutput里的格式,使用join需要改为字符串格式
    
    
    #style2
    import re
    # 如果overlapped   为False, 匹配到位置为2,10
    matches = re.finditer('ATAT', 'GATATATGCATATACTT')
    # 返回所有匹配项,
    for match in matches:  #迭代器可循环
        print(match.start())
    
    • re.match,re.search,re.findall,re.finditer区别
      re.match从首字母开始匹配,没有,就返回none
      re.search返回对象
      re.findall返回数组
      re.finditer返回迭代器
    #style3
    seq = 'GATATATGCATATACTT'
    pattern = 'ATAT'
    
    def find_motif(seq, pattern):
        position = []
        for i in range(len(seq) - len(pattern)):
            if seq[i:i + len(pattern)] == pattern:
                position.append(str(i + 1))
    
        print('\t'.join(position))
    
    find_motif(seq, pattern)
    

    10.寻找最可能的共同祖先

    Rosalind10 题目 rosalind 10 题目
    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    
    #rosalind10: consensus and profile
    #style1
    from collections import Counter
    from collections import OrderedDict  #有序字典
    # 写出到文件
    fh = open('cons.txt', 'wt')
    
    rosalind = OrderedDict()
    seqLength = 0
    with open('rosalind_cons.txt') as f:
        for line in f:
            line = line.rstrip()
            if line.startswith('>'):
                rosalindName = line
                rosalind[rosalindName] = ''
                # 跳出这个循环,进入下一个循环
                continue
            # 如果不是'>'开头,执行这一句
            rosalind[rosalindName] += line
        # len(ATCCAGCT)
        seqLength = len(rosalind[rosalindName])
    
    
    A, C, G, T = [], [], [], []
    consensusSeq = ''
    for i in range(seqLength):
        seq = ''
        # rosalind.keys = Rosalind_1...Rosalind_7
        for k in rosalind.keys():
            # 把 Rosalind_1...Rosalind_7 相同顺序的碱基加起来
            seq += rosalind[k][i]
        A.append(seq.count('A'))
        C.append(seq.count('C'))
        G.append(seq.count('G'))
        T.append(seq.count('T'))
        # 为seq计数
        counts = Counter(seq)
    
        # 从多到少返回,是个list啊,只返回第一个
        consensusSeq += counts.most_common()[0][0]
        print(counts.most_common())
    
    fh.write(consensusSeq + '\n')
    fh.write('\n'.join(['A:\t' + '\t'.join(map(str, A)), 'C:\t' + '\t'.join(map(str, C)),
                        'G:\t' + '\t'.join(map(str, G)), 'T:\t' + '\t'.join(map(str, T))]))
    # .join(map(str,A))  把 A=[5, 1, 0, 0, 5, 5, 0, 0] 格式转化成 A:5 1 0 0 5 5 0 0
    fh.close()
    
    
    #style2(未成功)
    import numpy as np
    import os
    from collections import Counter
    fhand = open('rosalind_cons.txt')
    t = []
    for line in fhand:
        if line.startswith('>'):
            continue
        else:
            line = line.rstrip()
            line_list = list(line)  # 变成一个list
        t.append(line_list)  # 再把list 放入list
    a = np.array(t)  # 创建一个二维数组
    print(a)
    L1, L2, L3, L4 = [], [], [], []
    comsquence = ''
    
    for i in range(a.shape[1]):  # shape[0],是7 行数,shape[1]是8 项目数
        l = [x[i] for x in a]  # 调出二维数组的每一列
        L1.append(l.count('A'))
        L2.append(l.count('C'))
        L3.append(l.count('T'))
        L4.append(l.count('G'))
        comsquence = comsquence + Counter(l).most_common()[0][0]
    print(comsquence)
    print('A:', L1, '\n', 'C:', L2, '\n', 'T:', L3, '\n', 'G:', L4)
    
    
    #style3
    def update_cnt(list, char, str):
        res = [a + b for a,b in zip(list, [1 if x == char else 0 for x in str])]
        return(res)
    
    def print_cnt(list, label):
        print(label,end = ": ")
        for x in list:
            print(x, end=" ")
        print()
    
    f = open("rosalind_cons.txt", 'r')
    fas = f.readlines()
    
    seq = ''
    j = 0
    for i in range(len(fas)):
        if fas[i][0] == '>' and j == 0:
            j += 1
            next
        elif fas[i][0] == '>':
            if j == 1:
                A = [0 for x in seq]
                C = A
                G = A
                T = A
            A = update_cnt(A, 'A', seq)
            C = update_cnt(C, 'C', seq)
            G = update_cnt(G, 'G', seq)
            T = update_cnt(T, 'T', seq)
            j += 1
            seq = ''
        else:
            seq += fas[i].strip()
    
    ## last seq
    A = update_cnt(A, 'A', seq)
    C = update_cnt(C, 'C', seq)
    G = update_cnt(G, 'G', seq)
    T = update_cnt(T, 'T', seq)
    
    consensus = ''
    for i in range(len(A)):
        m = max(A[i], C[i], G[i], T[I])
        if A[i] == m:
            consensus += 'A'
        elif C[i] == m:
            consensus += 'C'
        elif G[i] == m:
            consensus += 'G'
        elif T[i] == m:
            consensus += 'T'
    
    print(consensus)
    
    print_cnt(A, 'A')
    print_cnt(C, 'C')
    print_cnt(G, 'G')
    print_cnt(T, 'T')
    
    
    #style4
    import numpy as np #引入numpy
    d=open('rosalind_cons.txt','r')
    from collections import Counter #问题1里求A/T/C/G出现过多少次时用到的函数
    list2=[]
    a=1 #起标记作用,看是不是第一次遇到'>'
    dline=''
    for line in d:
        if line.startswith('>'):
            if a==0: #如果不是第一次遇到'>'
                list2.append(list(dline)) #list是为了创建二维数组
                dline=''
            continue
        else:
            a=0 #如果第一次遇到'>'
            dline+=line.rstrip() #去掉每一行右边的换行
    #最后一行
    list2.append(list(dline))
    bi=np.array(list2)#创建二维数组
    LA,LT,LC,LG=[],[],[],[]
    consensus =''#共同序列
    
    for i in range(bi.shape[1]):#数组第1行有多少个
        l=bi[:,i] #调出数组每一列
        listl=list(l)
        LA.append(listl.count('A'))#数一个加一个,LA是数值列表,每数一列就加一个在末尾
        LC.append(listl.count('C'))
        LT.append(listl.count('T'))
        LG.append(listl.count('G'))
        most=Counter(listl).most_common() #算出最多的那个
        print(most)
        consensus=consensus+most[0][0]#第一列的(排序)第一的相加
    
    print (consensus)
    print("A:"," ".join(str(j) for j in LA))
    print("C:"," ".join(str(j) for j in LC))
    print("G:"," ".join(str(j) for j in LG))
    print("T:"," ".join(str(j) for j in LT))
    
    • 矩阵运算: numpy spicy pandas
    • counters包

    相关文章

      网友评论

          本文标题:rosalind1-10

          本文链接:https://www.haomeiwen.com/subject/waunbftx.html