参考教材:Bioinformatics Algorithms Design and Implementation in Python
Naive Algorithm for Fixed Pattern Finding
把pattern里的字符和text里的字符逐一比较,简单粗暴的方法:
def search_first_occ(seq, pattern):
found = False#返回值是有没有找到pattern,所以先设定标签
i = 0
while i <= len(seq)-len (pattern) and not found:#i用来设置窗口的步移
j = 0
while j < len(pattern) and pattern[j] == seq[i + j]:#j设置一一比对pattern和窗口
j = j + 1
if j == len(pattern): found = True
else:
i +=1
if found:return True
else:return -1
def search_all_occurrences(seq, pattern):
res = []
for i in range( len (seq)-len(pattern)+1):
j = 0
while j < len(pattern) and pattern[j] == seq[i + j]:
j = j + 1
if j == len(pattern):
res.append(i)
return res
seqDNA = "ATAGAATAGATAATAGTC"
print ( search_first_occ(seqDNA , "GAAT") )
print ( search_first_occ(seqDNA , "TATA") )
print ( search_all_occurrences(seqDNA , "AAT") )
def test_pat_search():
seq = input ("Input sequence: ")
pat = input ("Input pattern: ")
print (pat, " occurs in the following positions:", )
print ( search_all_occurrences(seq, pat) )
test_pat_search()
#上述方法一共要比较(N − k + 1) × k times.非常耗时
Heuristic Algorithm: Boyer-Moore
BM算法是著名的字符串查找匹配算法:详情看:https://www.jianshu.com/p/87c44a9ffd49
class BoyerMoore:
def __init__(self, alphabet , pattern):
self.alphabet = alphabet
self.pattern = pattern
self.preprocess()
def preprocess(self):
self.process_bcr()
self.process_gsr()
def process_bcr(self):#创建字典occ,对应每个字符在pattern里的位置
self.occ ={}
for symb in self.alphabet:
self.occ[symb] = -1
for j in range(len(self.pattern)):
c = self.pattern[j]
self.occ[c] = j
def process_gsr(self):#好后缀规则,这里可能比较复杂,感兴趣的可以摸索一下
#如果程序匹配了一个好后缀, 并且在模式中还有另外一个相同的后缀或后缀的部分, 那把下一个后缀或部分移动到当前后缀位置
self.f = [0] *(len(self.pattern)+1)
self.s = [0]*(len(self.pattern)+1)
i = len(self.pattern)
j =len(self.pattern)+1
self.f[i] = j
while i > 0:
while j <=len(self.pattern) and self.pattern[i-1]!=self.pattern[j-1]:
if self.s[j] ==0:self.s[j] =j-i;
j = self.f[j]
i -=1
j -=1
self.f[i] = j
j = self.f[0]
for i in range(len(self.pattern)):
if self.s[i] ==0:self.s[i]=j
if i ==j:j=self.f[j]
def search_pattern(self, text):
res = []
i = 0
while i <= len(text) - len (self.pattern):
j =len(self.pattern)-1
while j >= 0 and self.pattern[j] == text[j+i]: j -= 1
if (j<0):
res.append(i)
i +=self.s[0]
else:
c = text[j + i]#如果没有匹配上,计算需要步移的长度,c为坏字符
i += max(self.s[j+1],j-self.occ[c])
return res
def test():
bm = BoyerMoore("ACTG", "ACCA")
print (bm.search_pattern("ATAGAACCAATGAACCATGATGAACCATGGATACCCAACCACC"))
test()
用正则表达式寻找匹配
先简单尝试一下re这个包的用法:
import re
str = "TGAAGTATGAGA"
mo = re.search("TAT",str)
mo.group()#返回匹配的字符串
mo.span()#返回匹配位置,为元组(start,end)
mo2 = re.search("TG.", str)#返回第一个匹配的
mo2.group()
"TGA"
mo2.start()#返回匹配起点位置,相当于mo2.span()[0]
re.findall("TG.", str)#返回全部匹配的
mos = re.finditer("TG.", str)#迭代查找
for x in mos:
print(x.group())
print(x.span())
#TGA
# (0, 3)
# TGA
# (7, 10)
如果要包装成函数的话,可以这样做:
def find_pattern_re (seq, pat):
from re import search
mo = search(pat, seq)
if (mo != None):
return mo.span()[0]#返回第一个匹配
else:
return -1
def find_all_occurrences_re (seq, pat):
from re import finditer
mos = finditer(pat, seq)
res = []
for x in mos:
res.append(x.span()[0])#返回所有匹配索引
return res
def test():
seq = input ("Input sequence:")
pat = input ("Input pattern (as a regular expression):")
res = find_pattern_re(seq, pat)
if res >= 0:
print ("Pattern found in position: ", res)
else : print ("Pattern not found")
all_res = find_all_occurrences_re(seq, pat)
if len(all_res) > 0:
print ("Pattern found in positions: ", all_res)
else : print ("Pattern not found")
test()
但是上述函数的缺陷是如果pattern在text里面重叠,它就识别不出来,比如:
# Input sequence:ATATGAAGAG
# Input pattern (as a regular expression):AT.
# Pattern found in position: 0
# Pattern found in positions: [0]
这里可以看到第一个ATA的第二个ATG的AT是重叠的
为了解决这个问题可以这样做:
添加(?=p)可以解决,p是pattern
def find_all_overlap(seq, pat):
return find_all_occurrences_re(seq, "(?="+pat+")")
def test():
seq = input("Input sequence:")
pat = input("Input pattern (as a regular expression):")
all_ov = find_all_overlap(seq, pat)
if len(all_ov) > 0:
print("Pattern found in positions: ", all_ov)
else:
print("Pattern not found")
test()
# Input sequence:>? ATATGAAGAG
# Input pattern (as a regular expression):>? AT.
# Pattern found in positions: [0, 2]
如果要在很多string中寻找匹配,为了使得查找更有效率,可以对RE进行预处理,这个步骤叫做compilation.
seq = "AAATAGAGATGAAGAGAGATAGCGC"
rgx = re.compile("GA.A")
rgx.search(seq).group()
#'GAGA'
rgx.findall(seq)
#['GAGA', 'GAGA', 'GATA']
mo = rgx.finditer(seq)
for x in mo: print (x.span())
# (5, 9)
# (13, 17)
# (17, 21)
对于复杂的RE,可以分别查看RE不同part在target中的match情况:
rgx = re.compile("(TATA..)((GC){3})")
seq = "ATATAAGGCGCGCGCTTATGCGC"
result = rgx.search(seq)
result.group(0)
'TATAAGGCGCGC'
result.group(1)#返回RE中第一个括号内的匹配
'TATAAG'
result.group(2)#返回RE中第二个括号内的匹配
'GCGCGC'
RE在序列分析中的应用
验证DNA序列的有效性
def validate_dna_re (seq):
from re import search
if search("[^ACTGactg]", seq) != None:#这里^开头表示“非”,如果除了四种碱基还有其他字符,不是合法的dna序列
return False
else:
return True
validate_dna_re("ATAGAGACTATCCGCTAGCT")
validate_dna_re("ATAGAGACTAXTCCGCTAGCT")
把密码子转换成氨基酸
def translate_codon_re (cod):
import re
if re.search("GC.", cod): aa = "A"
elif re.search("TG[TC]", cod): aa = "C"
elif re.search("GA[TC]", cod): aa = "D"
elif re.search("GA[AG]", cod): aa = "E"
elif re.search("TT[TC]", cod): aa = "F"
elif re.search("GG.", cod): aa = "G"
elif re.search("CA[TC]", cod): aa = "H"
elif re.search("AT[TCA]", cod): aa = "I"
elif re.search("AA[AG]", cod): aa = "K"
elif re.search("TT[AG]|CT.", cod): aa = "L"
elif re.search("ATG", cod): aa = "M"
elif re.search("AA[TC]", cod): aa = "N"
elif re.search("CC.", cod): aa = "P"
elif re.search("CA[AG]", cod): aa = "Q"
elif re.search("CG.|AG[AG]", cod): aa = "R"
elif re.search("TC.|AG[TC]", cod): aa = "S"
elif re.search("AC.", cod): aa = "T"
elif re.search("GT.", cod): aa = "V"
elif re.search("TGG", cod): aa = "W"
elif re.search("TA[TC]", cod): aa = "Y"
elif re.search("TA[AG]|TGA", cod): aa = "_";
else: aa = ""
return aa
在一串氨基酸序列中找到可能最大的蛋白
注意蛋白质以M开头,以_结尾
def largest_protein_re (seq_prot):
import re
mos = re.finditer("M[^_]∗_", seq_prot)#以M开头,中间不能有_,以_结尾
sizem = 0
lprot = ""
for x in mos:
ini = x.span()[0]
fin = x.span()[1]
s = fin - ini + 1#蛋白的长度
if s > sizem:#选择最大的蛋白输出
lprot = x.group()
sizem = s
return lprot
这里因为本来就是为了找到最大的蛋白,所以类似“M. . . M . . . M . . . _”.有overlap的情况不用考虑,
如果是需要把所有的蛋白找到,就要用先前提到的解决办法
Finding Protein Motifs
The Prosite database (http://prosite.expasy.org/) contains many protein motifs, represented
with different formats.蛋白模体数据库
motif的展示语法:
Some of the syntax rules of this representation are the following:
• each aminoacid is represented by one letter symbol
• a list of aminoacids within square brackets represents a list of possible aminoacids in a given position;
• the symbol “x” represents any aminoacid in a given position;
• a number within parenthesis after an aminoacid (or aminoacid list) represents the number of occurrences of those aminoacids;
• a pair of numbers separated by a comma symbol within parentheses, indicates a number of occurrences between the first and the second number (i.e. indicates a range for the
number of occurrences);• the “-” symbol is used to separate the several positions.
the “Zinc finger RING-type signature” (PS00518) motif:“C-x-H-x-[LIVMFY]-C-x(2)-C-[LIVMYA]”
现在的问题是怎么把pattern的syntax转化成RE
def find_zync_finger(seq):
from re import search
regexp = "C.H.[LIVMFY]C.{2}C[LIVMYA]"
mo = search(regexp , seq)
if (mo != None):
return mo.span()[0]
else:
return -1
def test():
seq = "HKMMLASCKHLLCLKCIVKLG"
print (find_zync_finger(seq))
test()
上面我们是在函数里面给定了motif,如果可以把regexp的motif也作为一个参数传入就好了:
def find_prosite(seq, profile):
from re import search
regexp = profile.replace("−","")
regexp = regexp.replace("x",".")
regexp = regexp.replace("(","{")
regexp = regexp.replace(")","}")
mo = search(regexp , seq)
if (mo != None):
return mo.span()[0]
else:
return -1
def test():
seq = "HKMMLASCKHLLCLKCIVKLG"
print (find_prosite(seq,"C−x−H−x−[LIVMFY]−C−x(2)−C−[LIVMYA]"))
test()
用ScanProsite查找蛋白磷酸化位点:http://www.360doc.com/content/17/0903/18/46615923_684371843.shtml
完整的syntax rule请看:https://prosite.expasy.org/scanprosite/scanprosite_doc.html
An Application to Restriction Enzymes
An Application to Restriction Enzymes:限制酶的数据库:http://rebase.neb.com/rebase/rebase.html
首先把iupac命名规则转化成RE
def iub_to_RE (iub):
dic = {"A":"A", "C":"C", "G":"G", "T":"T", "R":"[GA]", "Y":"[CT]","M":"[AC]", "K":"[GT]", "S":"[GC]", "W": "[AT]", "B":"[CGT]",
"D":"[AGT]", "H":"[ACT]", "V":"[ACG]", "N":"[ACGT]"}
site = iub.replace("^", "")
regexp = ""
for c in site:
regexp += dic[c]
return regexp
def test():
print (iub_to_RE("G^AMTV"))
test()
#GA[AC]T[ACG]
寻找酶切位点并生成子序列
def cut_positions (enzyme , sequence):#在序列中查找切点
from re import finditer
cutpos = enzyme.find("^")
regexp = iub_to_RE(enzyme)
matches = finditer(regexp , sequence)
locs = [ ]
for m in matches:
locs.append(m.start() + cutpos)
return locs
def cut_subsequences (locs, sequence):
res = []
positions = locs
positions.insert(0,0)
positions.append( len (sequence))
for i in range( len (positions)-1):
res.append(sequence[positions[i]:positions[i+1]])
return res
def test():
pos = cut_positions("G^ATTC", "GTAGAAGATTCTGAGATCGATTC")
print (pos)
print(cut_subsequences(pos, "GTAGAAGATTCTGAGATCGATTC"))
test()
#[7, 19]
# ['GTAGAAG', 'ATTCTGAGATCG', 'ATTC']
总结
简单粗暴地逐一比对、BM算法搜索、正则表达式匹配都是实现pattern finding的方法,个人比较推荐RE,python的re包还可以做更多个性化的搜索工作
网友评论