Python学习的第八天,主要学习文本挖掘和模式匹配。
生物信息大多数问题都是核酸或氨基酸序列的比对和寻找motif,利用正则表达式显得十分重要。比如一条核酸或氨基酸序列,你可能想寻找其中的磷酸化位点,甘露糖基化位点,转录结果位点等特定的序列结构,正则表达式可以帮助寻找特定模式的序列。
1. 在一条氨基酸序列中寻找特定序列
import re
seq = 'VSVLTMFRYAGWLDRLYMLVGTQLAAIIHGVALPLMMLI'
pattern = re.compile("[ST]Q") #"[ST]Q"表示"SQ"或"TQ"。
match = pattern.search(seq) #搜索seq中符合pattern的序列
print(match.start())
21
print(match.end())
23
if match:
print('%10s' %(seq[match.start() - 4:match.end() + 4])) #'%10s' 序列右对齐,
#显示从匹配初始位置前4个字符到匹配结束位置后4个字符
print('%6s' % match.group())
else:
print("no match")
MLVGTQLAAI
TQ
######
import re
seq = 'RQSAMGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQRPSKP'
pattern = re.compile('R.[ST][^P]') #查找符合'R.[ST][^P]'的序列,
#第一个是氨基酸R,第二"."表示任意氨基酸,
#第三个是S或T,第四个是除了P以外其他的任意氨基酸。
matches = pattern.findall(seq)
print(matches)
['RQSA', 'RRSL', 'RPSK']
match_iter = pattern.finditer(seq)
for match in match_iter:
print(match.group(), match.span()) #显示匹配的字符位置
print(match.start(), match.end())
RQSA (0, 4)
0 4
RRSL (18, 22)
18 22
RPSK (40, 44)
40 44
####
import re
seq = 'QSAMGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQRPSKP'
pattern1 = re.compile('R(.)[ST][^P]')
match1 = pattern1.search(seq)
print(match1.group())
RRSL
print(match1.group(1))
R
2. 匹配中的切片和替换
import re
separator = re.compile("\|") #以"|"为分隔符
annotation = 'ATOM:CA|RES:ALA|CHAIN:B|NUMRES:166'
columns = separator.split(annotation) #以"|"为分隔符将annotation切片
print(columns)
['ATOM:CA', 'RES:ALA', 'CHAIN:B', 'NUMRES:166']
###
new_annotation = separator.sub("@", annotation)
print(new_annotation)
ATOM:CA@RES:ALA@CHAIN:B@NUMRES:166
new_annotation2 = separator.sub('@', annotation, 2)
print(new_annotation2)
ATOM:CA@RES:ALA@CHAIN:B|NUMRES:166
new_annotation3 = separator.subn('@', annotation)
print(new_annotation3)
('ATOM:CA@RES:ALA@CHAIN:B@NUMRES:166', 3)
3. 将prosite文件模式中的符号转换为正则表达式
pattern = '[DEQN]-x-[DEQN](2)-C-x(3,14)-C-x(3,7)-C-x-[DN]-x(4)-[FY]-x-C'
pattern = pattern.replace('{', '[^')
pattern = pattern.replace('}', ']')
pattern = pattern.replace('(', '{')
pattern = pattern.replace(')', '}')
pattern = pattern.replace('-', '')
pattern = pattern.replace('x', '.')
pattern = pattern.replace('>', '$')
pattern = pattern.replace('<', '^')
print(pattern)
[DEQN].[DEQN]{2}C.{3,14}C.{3,7}C.[DN].{4}[FY].C
4. 寻找基因组序列上的转录因子结合位点
已知的转录因子结合位点
TFBS文件
import re
genome_seq = open("genome.txt").read() #读取基因组序列信息
sites = []
for line in open("TFBS.txt"): #读取已知的转录因子结合位点信息文件
fields = line.split()
tf = fields[0]
site = fields[1]
sites.append((tf,site)) #从文件中提取必要信息并格式化
sites
Out[44]:
[('UAS(G)-pMH100', 'CGGAGTACTGTCCTCCG'), #查看新的转录因子结合位点的表格
('TFIIIC-Xls-50', 'TGGATGGGAG'),
('HSE_CS_inver0', 'CTNGAANNTTCNAG'),
('ZDNA_CS', '0'),
('GCN4-his3-180', 'ATGACTCAT')]
for tf, site in sites:
tfbs_regexp = re.compile(site) #正则表达式的模式
all_matches = tfbs_regexp.findall(genome_seq) #显示出所有匹配的序列
matches = tfbs_regexp.finditer(genome_seq)
if all_matches:
print(tf, ":")
for tfbs in matches:
print("\t", tfbs.group(),tfbs.start(), tfbs.end()) #显示序列和匹配的位置
UAS(G)-pMH100 :
CGGAGTACTGTCCTCCG 0 17
CGGAGTACTGTCCTCCG 60 77
CGGAGTACTGTCCTCCG 120 137
CGGAGTACTGTCCTCCG 180 197
CGGAGTACTGTCCTCCG 240 257
CGGAGTACTGTCCTCCG 300 317
CGGAGTACTGTCCTCCG 360 377
CGGAGTACTGTCCTCCG 420 437
TFIIIC-Xls-50 :
TGGATGGGAG 17 27
TGGATGGGAG 77 87
TGGATGGGAG 137 147
TGGATGGGAG 197 207
TGGATGGGAG 257 267
TGGATGGGAG 317 327
TGGATGGGAG 377 387
TGGATGGGAG 437 447
HSE_CS_inver0 :
CTNGAANNTTCNAG 27 41
CTNGAANNTTCNAG 87 101
CTNGAANNTTCNAG 147 161
CTNGAANNTTCNAG 207 221
CTNGAANNTTCNAG 267 281
CTNGAANNTTCNAG 327 341
CTNGAANNTTCNAG 387 401
CTNGAANNTTCNAG 447 461
GCN4-his3-180 :
ATGACTCAT 50 59
ATGACTCAT 110 119
ATGACTCAT 170 179
ATGACTCAT 230 239
ATGACTCAT 290 299
ATGACTCAT 350 359
ATGACTCAT 410 419
ATGACTCAT 470 479
5. 从PubMed HTML 页面提取标题和摘要信息
import urllib.request as urllib2
import re
pmid = '18235848'
url = 'http://www.ncbi.nlm.nih.gov/pubmed?term=%s' % pmid #pmid = '18235848'的网页地址
url
Out[56]: 'http://www.ncbi.nlm.nih.gov/pubmed?term=18235848'
handler = urllib2.urlopen(url) #打开这个网页
html = handler.read()
html = html.decode("utf-8") #使用utf-8解码html
title_regexp = re.compile('<h1>.{5,400}</h1>') #html信息中标题是被<h1> 和 </h1>括起来的
title_text = title_regexp.search(html)
abstract_regexp = re.compile('<h3>Abstract</h3><div class=""><p><AbstractText>.{20,3000}</AbstractText></p></div>')
abstract_text = abstract_regexp.search(html)
print(url)
http://www.ncbi.nlm.nih.gov/pubmed?term=18235848
print('TITLE:', title_text.group())
TITLE: <h1>Quantitative high-throughput screen identifies inhibitors of the Schistosoma mansoni redox cascade.</h1>
print('ABSTRACT:', abstract_text.group()) #error!不知道原因,
#是不是ncbi页面改版导致代码现在不能用了?
6. 检测PubMed 文章摘要中的特定单词
import urllib.request as urllib2
import re
# word to be searched
word_regexp = re.compile('schistosoma')
# list of PMIDs where we want to search the word
pmids = ['18235848', '22607149', '22405002', '21630672']
for pmid in pmids:
url = 'http://www.ncbi.nlm.nih.gov/pubmed?term=' + pmid
handler = urllib2.urlopen(url)
html = handler.read()
title_regexp = re.compile('<h1>.{5,400}</h1>')
title = title_regexp.search(html)
title = title.group()
abstract_regexp = re.compile('<h3>Abstract</h3><p>.{20,3000}</p></div>')
abstract = abstract_regexp.search(html)
abstract = abstract.group()
word = word_regexp.search(abstract, re.IGNORECASE)
if word:
# display title and where the keyword was found
print(title)
print(word.group(), word.start(), word.end())
Traceback (most recent call last):
File "<ipython-input-71-3ad765bdb689>", line 23, in <module>
handler = urllib2.urlopen(url)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 531, in open
response = meth(req, response)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 641, in http_response
'http', request, response, code, msg, hdrs)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 569, in error
return self._call_chain(*args)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
HTTPError: Forbidden
#失败,访问获取不了Abstract信息
网友评论