美文网首页生物信息Python
使用python处理生物信息数据(八)

使用python处理生物信息数据(八)

作者: 你猜我菜不菜 | 来源:发表于2020-03-10 09:19 被阅读0次

    Python学习的第八天,主要学习文本挖掘和模式匹配。


    生物信息大多数问题都是核酸或氨基酸序列的比对和寻找motif,利用正则表达式显得十分重要。比如一条核酸或氨基酸序列,你可能想寻找其中的磷酸化位点,甘露糖基化位点,转录因子结合位点等特定的序列结构,正则表达式可以帮助寻找特定模式的序列。

    1. 在一条氨基酸序列中寻找特定序列
    import re
    
    seq = 'VSVLTMFRYAGWLDRLYMLVGTQLAAIIHGVALPLMMLI'
    
    pattern = re.compile("[ST]Q") #"[ST]Q" matches "SQ" or "TQ"
    
    match = pattern.search(seq) #find the first stretch of seq matching the pattern
    
    print(match.start())
    21
    
    print(match.end())
    23
    
    if match:
        print('%10s' %(seq[match.start() - 4:match.end() + 4])) #'%10s' right-aligns the slice;
    #show from 4 characters before the match start to 4 characters after the match end
        print('%6s' % match.group()) 
    else: 
        print("no match")
    MLVGTQLAAI
        TQ
    
    
    ######
    import re
    
    seq = 'RQSAMGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQRPSKP'
    
    pattern = re.compile('R.[ST][^P]') #find stretches matching 'R.[ST][^P]':
    #the first residue is R, the second "." is any amino acid,
    #the third is S or T, the fourth is any amino acid except P
    
    matches = pattern.findall(seq)
    
    print(matches)
    ['RQSA', 'RRSL', 'RPSK']
    
    match_iter = pattern.finditer(seq)
    
    for match in match_iter:
        print(match.group(), match.span()) #show the matched text and its position
        print(match.start(), match.end())
    RQSA (0, 4)
    0 4
    RRSL (18, 22)
    18 22
    RPSK (40, 44)
    40 44
    
    ####
    import re
    
    seq = 'QSAMGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQRPSKP'
    
    pattern1 = re.compile('R(.)[ST][^P]')
    
    match1 = pattern1.search(seq)
    
    print(match1.group())
    RRSL
    
    print(match1.group(1))
    R
    
    

    2. 匹配中的切片和替换
    import re
    
    separator = re.compile("\|") #以"|"为分隔符
    
    annotation = 'ATOM:CA|RES:ALA|CHAIN:B|NUMRES:166'
    
    columns = separator.split(annotation) #以"|"为分隔符将annotation切片
    
    print(columns)
    ['ATOM:CA', 'RES:ALA', 'CHAIN:B', 'NUMRES:166']
    
    ###
    new_annotation = separator.sub("@", annotation)
    
    print(new_annotation)
    ATOM:CA@RES:ALA@CHAIN:B@NUMRES:166
    
    new_annotation2 = separator.sub('@', annotation, 2)
    print(new_annotation2)
    ATOM:CA@RES:ALA@CHAIN:B|NUMRES:166
    
    new_annotation3 = separator.subn('@', annotation)
    print(new_annotation3)
    ('ATOM:CA@RES:ALA@CHAIN:B@NUMRES:166', 3)
    

    3. 将prosite文件模式中的符号转换为正则表达式
    pattern = '[DEQN]-x-[DEQN](2)-C-x(3,14)-C-x(3,7)-C-x-[DN]-x(4)-[FY]-x-C'
    pattern = pattern.replace('{', '[^')
    pattern = pattern.replace('}', ']')
    pattern = pattern.replace('(', '{')
    pattern = pattern.replace(')', '}')
    pattern = pattern.replace('-', '')
    pattern = pattern.replace('x', '.')
    pattern = pattern.replace('>', '$')
    pattern = pattern.replace('<', '^')
    print(pattern)
    [DEQN].[DEQN]{2}C.{3,14}C.{3,7}C.[DN].{4}[FY].C
    

    4. 寻找基因组序列上的转录因子结合位点

    已知的转录因子结合位点


    TFBS文件
    import re
    
    genome_seq = open("genome.txt").read() #读取基因组序列信息
    
    sites = []
    
    for line in open("TFBS.txt"): #读取已知的转录因子结合位点信息文件
        fields = line.split()
        tf = fields[0]
        site = fields[1]
        sites.append((tf,site)) #从文件中提取必要信息并格式化
        
    
    sites
    Out[44]: 
    [('UAS(G)-pMH100', 'CGGAGTACTGTCCTCCG'), #查看新的转录因子结合位点的表格
     ('TFIIIC-Xls-50', 'TGGATGGGAG'),
     ('HSE_CS_inver0', 'CTNGAANNTTCNAG'),
     ('ZDNA_CS', '0'),
     ('GCN4-his3-180', 'ATGACTCAT')]
    
    for tf, site in sites:
        tfbs_regexp = re.compile(site) #正则表达式的模式
        all_matches = tfbs_regexp.findall(genome_seq) #显示出所有匹配的序列
        matches = tfbs_regexp.finditer(genome_seq)
        if all_matches:
            print(tf, ":")
            for tfbs in matches:
                print("\t", tfbs.group(),tfbs.start(), tfbs.end()) #显示序列和匹配的位置
                
    UAS(G)-pMH100 :
             CGGAGTACTGTCCTCCG 0 17
             CGGAGTACTGTCCTCCG 60 77
             CGGAGTACTGTCCTCCG 120 137
             CGGAGTACTGTCCTCCG 180 197
             CGGAGTACTGTCCTCCG 240 257
             CGGAGTACTGTCCTCCG 300 317
             CGGAGTACTGTCCTCCG 360 377
             CGGAGTACTGTCCTCCG 420 437
    TFIIIC-Xls-50 :
             TGGATGGGAG 17 27
             TGGATGGGAG 77 87
             TGGATGGGAG 137 147
             TGGATGGGAG 197 207
             TGGATGGGAG 257 267
             TGGATGGGAG 317 327
             TGGATGGGAG 377 387
             TGGATGGGAG 437 447
    HSE_CS_inver0 :
             CTNGAANNTTCNAG 27 41
             CTNGAANNTTCNAG 87 101
             CTNGAANNTTCNAG 147 161
             CTNGAANNTTCNAG 207 221
             CTNGAANNTTCNAG 267 281
             CTNGAANNTTCNAG 327 341
             CTNGAANNTTCNAG 387 401
             CTNGAANNTTCNAG 447 461
    GCN4-his3-180 :
             ATGACTCAT 50 59
             ATGACTCAT 110 119
             ATGACTCAT 170 179
             ATGACTCAT 230 239
             ATGACTCAT 290 299
             ATGACTCAT 350 359
             ATGACTCAT 410 419
             ATGACTCAT 470 479
    

    5. 从PubMed HTML 页面提取标题和摘要信息
    import urllib.request as urllib2
    
    import re
    
    pmid = '18235848'
    
    url = 'http://www.ncbi.nlm.nih.gov/pubmed?term=%s' % pmid #pmid = '18235848'的网页地址
    
    url
    Out[56]: 'http://www.ncbi.nlm.nih.gov/pubmed?term=18235848' 
    
    handler = urllib2.urlopen(url) #打开这个网页
    
    html = handler.read()
    
    html = html.decode("utf-8") #使用utf-8解码html
    
    title_regexp = re.compile('<h1>.{5,400}</h1>') #html信息中标题是被<h1> 和 </h1>括起来的
    
    title_text = title_regexp.search(html)
    
    abstract_regexp = re.compile('<h3>Abstract</h3><div class=""><p><AbstractText>.{20,3000}</AbstractText></p></div>')
    
    abstract_text = abstract_regexp.search(html)
    
    print(url)
    http://www.ncbi.nlm.nih.gov/pubmed?term=18235848
    
    print('TITLE:', title_text.group())
    TITLE: <h1>Quantitative high-throughput screen identifies inhibitors of the Schistosoma mansoni redox cascade.</h1>
    
    print('ABSTRACT:', abstract_text.group()) #error!不知道原因,
    #是不是ncbi页面改版导致代码现在不能用了?
    

    6. 检测PubMed 文章摘要中的特定单词
     import urllib.request as urllib2
    import re
    
    # word to be searched
    word_regexp = re.compile('schistosoma')
    
    # list of PMIDs where we want to search the word
    pmids = ['18235848', '22607149', '22405002', '21630672']
    for pmid in pmids:
        url = 'http://www.ncbi.nlm.nih.gov/pubmed?term=' + pmid
        handler = urllib2.urlopen(url)
        html = handler.read()
        title_regexp = re.compile('<h1>.{5,400}</h1>')
        title = title_regexp.search(html)
        title = title.group() 
        abstract_regexp = re.compile('<h3>Abstract</h3><p>.{20,3000}</p></div>')
        abstract = abstract_regexp.search(html)
        abstract = abstract.group()
        word = word_regexp.search(abstract, re.IGNORECASE)
        if word:
            # display title and where the keyword was found
            print(title)
            print(word.group(), word.start(), word.end())
    Traceback (most recent call last):
    
      File "<ipython-input-71-3ad765bdb689>", line 23, in <module>
        handler = urllib2.urlopen(url)
    
      File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 222, in urlopen
        return opener.open(url, data, timeout)
    
      File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 531, in open
        response = meth(req, response)
    
      File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 641, in http_response
        'http', request, response, code, msg, hdrs)
    
      File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 569, in error
        return self._call_chain(*args)
    
      File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 503, in _call_chain
        result = func(*args)
    
      File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 649, in http_error_default
        raise HTTPError(req.full_url, code, msg, hdrs, fp)
    
    HTTPError: Forbidden
    
    #失败,访问获取不了Abstract信息(HTTP 403 Forbidden:urllib默认的User-Agent被NCBI服务器拒绝,
    #需要在请求头中加入浏览器的User-Agent才能访问)
    

    相关文章

      网友评论

        本文标题:使用python处理生物信息数据(八)

        本文链接:https://www.haomeiwen.com/subject/tbjjdhtx.html