美文网首页Python
python批量爬取NCBI基因注释并调用谷歌API批量翻译

python批量爬取NCBI基因注释并调用谷歌API批量翻译

作者: 机器人会画画 | 来源:发表于2020-02-09 20:15 被阅读0次

    今天是2020-02-09
    作者:沙雕学习小组
    这里有视频教程:
    https://www.bilibili.com/video/av87724182
    今天想实现这个功能:
    差异分析得到了200多个基因(甚至更多)

    1

    我要把这些基因的summary信息一个一个查出来,手动查的话可能要查到下个星期,可是周五就要汇报了啊……!


    2

    有python怕啥?!不要慌

    动手之前先动脑

    step1:获取这个基因在NCBI上的summary信息——输入gene.txt得到genesummary.txt
    step2:检查输出文件是否有空行,若有则删掉——输入genesummary.txt,得到newsummary.txt
    step3:批量翻译——输入newsummary.txt,得到genetrans.txt

    step1:获取这个基因在NCBI上的summary信息——输入gene.txt得到genesummary.txt

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    # Author:Abao
    
    from Bio import Entrez # pip install biopython
    #from translate_api.translate_api import api # pip install translate_api
    #from Pytrans import *
    import re
    
    # NCBI E-utilities requests are identified by this contact email.
    Entrez.email = "shinningbzw@foxmail.com"

    # Edit the paths below; absolute paths are recommended.
    output_file = 'genesummary.txt'  # gene names and summaries are written here
    # De-duplicated gene list, one symbol per line
    # (save the gene column as txt, e.g. `uniq genes.txt > gene.txt`).
    input_file = 'gene.txt'


    gene_list = []
    line_c = []
    # Count the input lines inside a context manager so the file handle is
    # closed immediately instead of being left to the garbage collector.
    with open(input_file, 'r') as gene_fh:
        count = sum(1 for _ in gene_fh)
    print("Waiting...")
    
    #from Pytrans import *
    import requests
    from Pytrans import *
    
    def google_translate(content):
        """Translate English text to Simplified Chinese via the Google web endpoint.

        Args:
            content: English text to translate; at most 4891 characters.

        Returns:
            The translated text, or ``None`` (after printing "too long!!!")
            when ``content`` is longer than 4891 characters.

        Raises:
            requests.RequestException: if the request fails, times out, or the
                server answers with an HTTP error status.
        """
        # Reject oversized input before spending time on the token computation.
        if len(content) > 4891:
            print("too long!!!")
            return None

        tk = Pytrans().getTk(content)
        param = {'tk': tk, 'q': content}

        # Adjacent literals keep the URL free of the newlines and indentation
        # that a single triple-quoted string would embed in the query values.
        url = (
            "http://translate.google.cn/translate_a/single?client=t&sl=en"
            "&tl=zh-CN&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss"
            "&dt=t&ie=UTF-8&oe=UTF-8&clearbtn=1&otf=1&pc=1&srcrom=0&ssel=0&tsel=0&kc=2"
        )
        # Without a timeout a stalled connection would hang the whole batch.
        result = requests.get(url, params=param, timeout=30)
        result.raise_for_status()

        # Concatenate the first item of every entry in the payload's first
        # element, skipping entries where that item is None.
        trans = result.json()[0]
        return ''.join(segment[0] for segment in trans if segment[0] is not None)
    
    
    #a = google_translate("hello,Input file will be translated, please be patient")
    #print(a)
    
    
    # Load the gene symbols. The first line of the input file is treated as a
    # header and dropped (this is what the original list handling effectively
    # did); blank lines are ignored so they never become empty search terms.
    with open(input_file) as gene_fh:
        gene_lines = [raw.strip() for raw in gene_fh]
    gene_list = [gene for gene in gene_lines[1:] if gene]

    # Removes every square-bracketed span from the summary text.
    rm_pattern = re.compile(r'\[.*?\]')

    total = len(gene_list)
    with open(output_file, 'a+', encoding='utf-8') as f:
        for done, gene in enumerate(gene_list, start=1):
            gene_term = "(" + gene + "[Gene Name]) AND Homo sapiens[Organism]"

            # Look up the NCBI Gene ID for this symbol.
            handle = Entrez.esearch(db="gene", term=gene_term)
            try:
                id_list = Entrez.read(handle)['IdList']
            finally:
                handle.close()

            if not id_list:
                # A symbol without any hit must not abort the whole batch.
                print("No NCBI Gene record found for " + gene + ", skipped")
            else:
                # Fetch the summary of the first (best) hit.
                sum_handle = Entrez.esummary(db="gene", id=id_list[0])
                try:
                    sum_record = Entrez.read(sum_handle)
                finally:
                    sum_handle.close()
                r_gene_sum = sum_record['DocumentSummarySet']['DocumentSummary'][0]['Summary']
                gene_sum = rm_pattern.sub('', r_gene_sum)
                f.write(gene + "\n" + gene_sum + "\n")

            # Report progress after every gene, measured against the number of
            # genes actually processed (the header line is not counted).
            print("Completed " + str(int(done / total * 100)) + "%")
    
    

    step2:检查输出文件是否有空行,若有删掉。输入genesummary.txt得到newsummary.txt

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    # Author:cici
    
    # Edit the file paths here; double-check the file names.
    # Copies genesummary.txt to newsummary.txt, leaving out whitespace-only lines.
    with open('genesummary.txt', 'r', encoding='utf-8') as fr, \
            open('newsummary.txt', 'w', encoding='utf-8') as fd:
        kept_rows = [row for row in fr if row.strip()]
        fd.writelines(kept_rows)
        print('输出成功....')
    

    step3:批量翻译

    这里先写个类,保存为Pytrans.py(需要先 pip install PyExecJS),后面的脚本通过 from Pytrans import * 来调用它

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    # Author:Topshi
    
    import execjs
    
    
    class Pytrans():
        """Compute the ``tk`` request token by running embedded JavaScript.

        The JavaScript source is compiled once per instance with ``execjs``;
        ``getTk`` then calls its ``TL`` entry point.
        """

        def __init__(self) -> None:
            """Compile the JavaScript token routine and keep the context on ``self.ctx``."""
            # JS overview:
            #   TL(a): turns the input string into an array of byte values
            #          (the bit patterns look like UTF-8 encoding), folds them
            #          into an accumulator seeded with the hard-coded constant
            #          406644 using RL, mixes in 3293161072, reduces modulo
            #          1E6 and returns "<value>.<value ^ 406644>" as a string.
            #   RL(a, b): applies the shift/add/xor steps described by the
            #          three-character groups of the opcode string b.
            self.ctx = execjs.compile("""
            function TL(a) {
            var k = "";
            var b = 406644;
            var b1 = 3293161072;
    
            var jd = ".";
            var $b = "+-a^+6";
            var Zb = "+-3^+b+-f";
    
            for (var e = [], f = 0, g = 0; g < a.length; g++) {
                var m = a.charCodeAt(g);
                128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),
                e[f++] = m >> 18 | 240,
                e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,
                e[f++] = m >> 6 & 63 | 128),
                e[f++] = m & 63 | 128)
            }
            a = b;
            for (f = 0; f < e.length; f++) a += e[f],
            a = RL(a, $b);
            a = RL(a, Zb);
            a ^= b1 || 0;
            0 > a && (a = (a & 2147483647) + 2147483648);
            a %= 1E6;
            return a.toString() + jd + (a ^ b)
        };
    
        function RL(a, b) {
            var t = "a";
            var Yb = "+";
            for (var c = 0; c < b.length - 2; c += 3) {
                var d = b.charAt(c + 2),
                d = d >= t ? d.charCodeAt(0) - 87 : Number(d),
                d = b.charAt(c + 1) == Yb ? a >>> d: a << d;
                a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
            }
            return a
        }
        """)
    
        def getTk(self, text: str):
            """Return the token that the JavaScript ``TL`` function computes for ``text``.

            Args:
                text: The string that is about to be sent for translation.

            Returns:
                Whatever ``TL`` returns through execjs; in the JavaScript this
                is a string of the form ``"<number>.<number>"``.
            """
            return self.ctx.call("TL", text)
    

    调用这个函数——输入newsummary.txt,得到genetrans.txt

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    # Author:Topshi
    from Pytrans import *
    import requests
    
    
    def google_translate(content):
        """Translate English text to Simplified Chinese via the Google web endpoint.

        Args:
            content: English text to translate; at most 4891 characters.

        Returns:
            The translated text, or ``None`` (after printing "too long!!!")
            when ``content`` is longer than 4891 characters.

        Raises:
            requests.RequestException: if the request fails, times out, or the
                server answers with an HTTP error status.
        """
        # Reject oversized input before spending time on the token computation.
        if len(content) > 4891:
            print("too long!!!")
            return None

        tk = Pytrans().getTk(content)
        param = {'tk': tk, 'q': content}

        # Adjacent literals keep the URL free of the newlines and indentation
        # that a single triple-quoted string would embed in the query values.
        url = (
            "http://translate.google.cn/translate_a/single?client=t&sl=en"
            "&tl=zh-CN&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss"
            "&dt=t&ie=UTF-8&oe=UTF-8&clearbtn=1&otf=1&pc=1&srcrom=0&ssel=0&tsel=0&kc=2"
        )
        # Without a timeout a stalled connection would hang the whole batch.
        result = requests.get(url, params=param, timeout=30)
        result.raise_for_status()

        # Concatenate the first item of every entry in the payload's first
        # element, skipping entries where that item is None.
        trans = result.json()[0]
        return ''.join(segment[0] for segment in trans if segment[0] is not None)
    
    
    # Smoke test: make sure the translation endpoint answers before the batch starts.
    a = google_translate("hello,Input file will be translated, please be patient")
    print(a)

    # newsummary.txt is written as UTF-8 by the previous step, so read it back
    # as UTF-8 instead of the platform default. Blank lines are dropped here
    # because they make the translation step fail.
    with open('newsummary.txt', 'r', encoding='utf-8') as f:
        genotype_annotation_list = [element.strip() for element in f if element.strip()]

    count = 0
    # The context manager guarantees the output is flushed and closed, even if
    # a request fails halfway through the batch.
    with open('genetrans.txt', "a+", encoding='utf-8') as translate_file:
        for ga in genotype_annotation_list:
            translation = google_translate(ga)
            if translation is None:
                # google_translate returns None for over-long input; keep the
                # original text so output lines stay aligned with input lines.
                print('not translated (too long):', ga[:60])
                translation = ga
            translate_file.write(translation + '\n')
            count += 1
            print('complete', '%.1f%%' % ((count / len(genotype_annotation_list)) * 100))
    
    
    欢迎关注我的公众号: 天黑请闭眼预言家请睁眼.jpg

    相关文章

      网友评论

        本文标题:python批量爬取NCBI基因注释并调用谷歌API批量翻译

        本文链接:https://www.haomeiwen.com/subject/dfmnxhtx.html