美文网首页生物信息学与算法
用python实现pattern finding

用python实现pattern finding

作者: Shaoqian_Ma | 来源:发表于2020-04-18 14:25 被阅读0次

    参考教材:Bioinformatics Algorithms Design and Implementation in Python

    Naive Algorithm for Fixed Pattern Finding

    把pattern里的字符和text里的字符逐一比较,简单粗暴的方法:

    def search_first_occ(seq, pattern):
        found = False#返回值是有没有找到pattern,所以先设定标签
        i = 0
        while i <= len(seq)-len (pattern) and not found:#i用来设置窗口的步移
            j = 0
            while j < len(pattern) and pattern[j] == seq[i + j]:#j设置一一比对pattern和窗口
                j = j + 1
            if j == len(pattern): found = True
            else:
                i +=1
        if found:return True
        else:return -1
    
    def search_all_occurrences(seq, pattern):
        res = []
        for i in range( len (seq)-len(pattern)+1):
            j = 0
            while j < len(pattern) and pattern[j] == seq[i + j]:
                j = j + 1
            if j == len(pattern):
                res.append(i)
        return res
    
    seqDNA = "ATAGAATAGATAATAGTC"
    print ( search_first_occ(seqDNA , "GAAT") )
    print ( search_first_occ(seqDNA , "TATA") )
    print ( search_all_occurrences(seqDNA , "AAT") )
    
    def test_pat_search():
        seq = input ("Input sequence: ")
        pat = input ("Input pattern: ")
        print (pat, " occurs in the following positions:", )
        print ( search_all_occurrences(seq, pat) )
    test_pat_search()
    
    #上述方法一共要比较(N − k + 1) × k times.非常耗时
    

    Heuristic Algorithm: Boyer-Moore

    BM算法是著名的字符串查找匹配算法:详情看:https://www.jianshu.com/p/87c44a9ffd49

    class BoyerMoore:
       def __init__(self, alphabet , pattern):
          self.alphabet = alphabet
          self.pattern = pattern
          self.preprocess()
    
       def preprocess(self):
          self.process_bcr()
          self.process_gsr()
    
       def process_bcr(self):#创建字典occ,对应每个字符在pattern里的位置
          self.occ ={}
          for symb in self.alphabet:
             self.occ[symb] = -1
          for j in range(len(self.pattern)):
             c = self.pattern[j]
             self.occ[c] = j
       def process_gsr(self):#好后缀规则,这里可能比较复杂,感兴趣的可以摸索一下
        #如果程序匹配了一个好后缀, 并且在模式中还有另外一个相同的后缀或后缀的部分, 那把下一个后缀或部分移动到当前后缀位置
          self.f = [0] *(len(self.pattern)+1)
          self.s = [0]*(len(self.pattern)+1)
          i = len(self.pattern)
          j =len(self.pattern)+1
          self.f[i] = j 
          while i > 0:
             while j <=len(self.pattern) and self.pattern[i-1]!=self.pattern[j-1]:
                if self.s[j] ==0:self.s[j] =j-i;
                j = self.f[j]
             i -=1
             j -=1
             self.f[i] = j
          j = self.f[0]
          for i in range(len(self.pattern)):
             if self.s[i] ==0:self.s[i]=j
             if i ==j:j=self.f[j]
    
       def search_pattern(self, text):
          res = []
          i = 0
          while i <= len(text) - len (self.pattern):
             j =len(self.pattern)-1
             while j >= 0 and self.pattern[j] == text[j+i]: j -= 1
             if (j<0):
                res.append(i)
                i +=self.s[0]
             else:
                c = text[j + i]#如果没有匹配上,计算需要步移的长度,c为坏字符
                i += max(self.s[j+1],j-self.occ[c])
          return res
    
    def test():
       bm = BoyerMoore("ACTG", "ACCA")
       print (bm.search_pattern("ATAGAACCAATGAACCATGATGAACCATGGATACCCAACCACC"))
    test()
    

    用正则表达式寻找匹配

    先简单尝试一下re这个包的用法:

    import re
    
    str = "TGAAGTATGAGA"
    mo = re.search("TAT",str)
    mo.group()#返回匹配的字符串
    mo.span()#返回匹配位置,为元组(start,end)
    
    mo2 = re.search("TG.", str)#返回第一个匹配的
    mo2.group()
    "TGA"
    mo2.start()#返回匹配起点位置,相当于mo2.span()[0]
    re.findall("TG.", str)#返回全部匹配的
    mos = re.finditer("TG.", str)#迭代查找
    for x in mos:
       print(x.group())
       print(x.span())
    #TGA
    # (0, 3)
    # TGA
    # (7, 10)
    

    如果要包装成函数的话,可以这样做:

    def find_pattern_re (seq, pat):
       from re import search
       mo = search(pat, seq)
       if (mo != None):
          return mo.span()[0]#返回第一个匹配
       else:
          return -1
    def find_all_occurrences_re (seq, pat):
       from re import finditer
       mos = finditer(pat, seq)
       res = []
       for x in mos:
          res.append(x.span()[0])#返回所有匹配索引
       return res
    
    def test():
       seq = input ("Input sequence:")
       pat = input ("Input pattern (as a regular expression):")
       res = find_pattern_re(seq, pat)
       if res >= 0:
          print ("Pattern found in position: ", res)
       else : print ("Pattern not found")
       all_res = find_all_occurrences_re(seq, pat)
       if len(all_res) > 0:
          print ("Pattern found in positions: ", all_res)
       else : print ("Pattern not found")
    test()
    

    但是上述函数的缺陷是如果pattern在text里面重叠,它就识别不出来,比如:

    # Input sequence:ATATGAAGAG
    # Input pattern (as a regular expression):AT.
    # Pattern found in position: 0
    # Pattern found in positions: [0]
    

    这里可以看到第一个ATA的第二个ATG的AT是重叠的

    为了解决这个问题可以这样做:

    添加(?=p)可以解决,p是pattern
    def find_all_overlap(seq, pat):
        return find_all_occurrences_re(seq, "(?="+pat+")")
    def test():
        seq = input("Input sequence:")
        pat = input("Input pattern (as a regular expression):")
        all_ov = find_all_overlap(seq, pat)
        if len(all_ov) > 0:
            print("Pattern found in positions: ", all_ov)
        else:
            print("Pattern not found")
    
    test()
    # Input sequence:>? ATATGAAGAG
    # Input pattern (as a regular expression):>? AT.
    # Pattern found in positions:  [0, 2]
    

    如果要在很多string中寻找匹配,为了使得查找更有效率,可以对RE进行预处理,这个步骤叫做compilation.

    seq = "AAATAGAGATGAAGAGAGATAGCGC"
    rgx = re.compile("GA.A")
    rgx.search(seq).group()
    #'GAGA'
    rgx.findall(seq)
    #['GAGA', 'GAGA', 'GATA']
    mo = rgx.finditer(seq)
    for x in mo: print (x.span())
    # (5, 9)
    # (13, 17)
    # (17, 21)
    

    对于复杂的RE,可以分别查看RE不同part在target中的match情况:

    rgx = re.compile("(TATA..)((GC){3})")
    seq = "ATATAAGGCGCGCGCTTATGCGC"
    result = rgx.search(seq)
    result.group(0)
    'TATAAGGCGCGC'
    result.group(1)#返回RE中第一个括号内的匹配
    'TATAAG'
    result.group(2)#返回RE中第二个括号内的匹配
    'GCGCGC'
    

    RE在序列分析中的应用

    验证DNA序列的有效性

    def validate_dna_re (seq):
       from re import search
       if search("[^ACTGactg]", seq) != None:#这里^开头表示“非”,如果除了四种碱基还有其他字符,不是合法的dna序列
          return False
       else:
          return True
    validate_dna_re("ATAGAGACTATCCGCTAGCT")
    validate_dna_re("ATAGAGACTAXTCCGCTAGCT")
    

    把密码子转换成氨基酸

    def translate_codon_re (cod):
       import re
       if re.search("GC.", cod): aa = "A"
       elif re.search("TG[TC]", cod): aa = "C"
       elif re.search("GA[TC]", cod): aa = "D"
       elif re.search("GA[AG]", cod): aa = "E"
       elif re.search("TT[TC]", cod): aa = "F"
       elif re.search("GG.", cod): aa = "G"
       elif re.search("CA[TC]", cod): aa = "H"
       elif re.search("AT[TCA]", cod): aa = "I"
       elif re.search("AA[AG]", cod): aa = "K"
       elif re.search("TT[AG]|CT.", cod): aa = "L"
       elif re.search("ATG", cod): aa = "M"
       elif re.search("AA[TC]", cod): aa = "N"
       elif re.search("CC.", cod): aa = "P"
       elif re.search("CA[AG]", cod): aa = "Q"
       elif re.search("CG.|AG[AG]", cod): aa = "R"
       elif re.search("TC.|AG[TC]", cod): aa = "S"
       elif re.search("AC.", cod): aa = "T"
       elif re.search("GT.", cod): aa = "V"
       elif re.search("TGG", cod): aa = "W"
       elif re.search("TA[TC]", cod): aa = "Y"
       elif re.search("TA[AG]|TGA", cod): aa = "_";
       else: aa = ""
       return aa
    

    在一串氨基酸序列中找到可能最大的蛋白

    注意蛋白质以M开头,以_结尾

    def largest_protein_re (seq_prot):
       import re
       mos = re.finditer("M[^_]∗_", seq_prot)#以M开头,中间不能有_,以_结尾
       sizem = 0
       lprot = ""
       for x in mos:
          ini = x.span()[0]
          fin = x.span()[1]
          s = fin - ini + 1#蛋白的长度
          if s > sizem:#选择最大的蛋白输出
             lprot = x.group()
             sizem = s
       return lprot
    

    这里因为本来就是为了找到最大的蛋白,所以类似“M. . . M . . . M . . . _”.有overlap的情况不用考虑,
    如果是需要把所有的蛋白找到,就要用先前提到的解决办法

    Finding Protein Motifs

    The Prosite database (http://prosite.expasy.org/) contains many protein motifs, represented
    with different formats.蛋白模体数据库

    motif的展示语法:

    Some of the syntax rules of this representation are the following:

    • each aminoacid is represented by one letter symbol

    • a list of aminoacids within square brackets represents a list of possible aminoacids in a given position;

    • the symbol “x” represents any aminoacid in a given position;

    • a number within parenthesis after an aminoacid (or aminoacid list) represents the number of occurrences of those aminoacids;

    • a pair of numbers separated by a comma symbol within parentheses, indicates a number of occurrences between the first and the second number (i.e. indicates a range for the
    number of occurrences);

    • the “-” symbol is used to separate the several positions.

    the “Zinc finger RING-type signature” (PS00518) motif:“C-x-H-x-[LIVMFY]-C-x(2)-C-[LIVMYA]”

    现在的问题是怎么把pattern的syntax转化成RE

    def find_zync_finger(seq):
       from re import search
       regexp = "C.H.[LIVMFY]C.{2}C[LIVMYA]"
       mo = search(regexp , seq)
       if (mo != None):
          return mo.span()[0]
       else:
          return -1
    def test():
       seq = "HKMMLASCKHLLCLKCIVKLG"
       print (find_zync_finger(seq))
    test()
    

    上面我们是在函数里面给定了motif,如果可以把regexp的motif也作为一个参数传入就好了:

    def find_prosite(seq, profile):
       from re import search
       regexp = profile.replace("−","")
       regexp = regexp.replace("x",".")
       regexp = regexp.replace("(","{")
       regexp = regexp.replace(")","}")
       mo = search(regexp , seq)
       if (mo != None):
          return mo.span()[0]
       else:
          return -1
    def test():
       seq = "HKMMLASCKHLLCLKCIVKLG"
       print (find_prosite(seq,"C−x−H−x−[LIVMFY]−C−x(2)−C−[LIVMYA]"))
    test()
    

    用ScanProsite查找蛋白磷酸化位点:http://www.360doc.com/content/17/0903/18/46615923_684371843.shtml

    完整的syntax rule请看:https://prosite.expasy.org/scanprosite/scanprosite_doc.html

    An Application to Restriction Enzymes

    An Application to Restriction Enzymes:限制酶的数据库:http://rebase.neb.com/rebase/rebase.html

    首先把iupac命名规则转化成RE

    def iub_to_RE (iub):
       dic = {"A":"A", "C":"C", "G":"G", "T":"T", "R":"[GA]", "Y":"[CT]","M":"[AC]", "K":"[GT]", "S":"[GC]", "W": "[AT]", "B":"[CGT]",
       "D":"[AGT]", "H":"[ACT]", "V":"[ACG]", "N":"[ACGT]"}
       site = iub.replace("^", "")
       regexp = ""
       for c in site:
          regexp += dic[c]
       return regexp
    def test():
       print (iub_to_RE("G^AMTV"))
    test()
    #GA[AC]T[ACG]
    

    寻找酶切位点并生成子序列

    def cut_positions (enzyme , sequence):#在序列中查找切点
       from re import finditer
       cutpos = enzyme.find("^")
       regexp = iub_to_RE(enzyme)
       matches = finditer(regexp , sequence)
       locs = [ ]
       for m in matches:
          locs.append(m.start() + cutpos)
       return locs
    def cut_subsequences (locs, sequence):
       res = []
       positions = locs
       positions.insert(0,0)
       positions.append( len (sequence))
       for i in range( len (positions)-1):
          res.append(sequence[positions[i]:positions[i+1]])
       return res
    def test():
       pos = cut_positions("G^ATTC", "GTAGAAGATTCTGAGATCGATTC")
       print (pos)
       print(cut_subsequences(pos, "GTAGAAGATTCTGAGATCGATTC"))
    test()
    #[7, 19]
    # ['GTAGAAG', 'ATTCTGAGATCG', 'ATTC']
    

    总结

    简单粗暴地逐一比对、BM算法搜索、正则表达式匹配都是实现pattern finding的方法,个人比较推荐RE,python的re包还可以做更多个性化的搜索工作

    相关文章

      网友评论

        本文标题:用python实现pattern finding

        本文链接:https://www.haomeiwen.com/subject/xyavvhtx.html