美文网首页
PDF文档全文翻译,保留原有的页面布局

PDF文档全文翻译,保留原有的页面布局

作者: 洗洗睡吧i | 来源:发表于2021-12-27 17:53 被阅读0次

    PDF文档全文翻译,保留原有的PDF页面布局

    1. 采用 PyMuPDF 提取 PDF 中的图片和文字,并按原位置写入新页面

    import fitz
    import re
    from pprint import pprint
    
    # Translate every page of a PDF while keeping the original layout: each
    # source page is rebuilt on a blank page of the same size, images are copied
    # to their original rectangles and each text block is re-inserted (translated)
    # into the rectangle it came from.
    pdf_name = 'xxx-en.pdf'
    print(f'Source pdf file: {pdf_name} \n')

    # Only the trailing extension is swapped. A plain str.replace() would also
    # rewrite '.pdf' in the middle of the name and would leave the name unchanged
    # for '.PDF' or a missing extension, making the output path equal the input.
    if pdf_name.lower().endswith('.pdf'):
        new_name = pdf_name[:-len('.pdf')] + '-zh.pdf'
    else:
        new_name = pdf_name + '-zh.pdf'

    # Characters replaced by a space before the text is translated; compiled once
    # instead of on every text block.
    noise_chars = re.compile('[@#$%^&*\'\"\n\r\t]')

    src_pdf = fitz.open(pdf_name)
    new_pdf = fitz.open()
    try:
        for p, page in enumerate(src_pdf):
            print(f'\n- translating PAGE -{p}- ...')

            # 1.1 Create a new page with the same size as the source page.
            new_page = new_pdf.new_page(width=page.rect.width, height=page.rect.height)

            blocks = page.get_text('dict')['blocks']

            # 1.2 Images: blocks of type 1 are copied to the same rectangle.
            for img in (b for b in blocks if b['type'] == 1):
                new_page.insert_image(img['bbox'], stream=img['image'])

            # 1.3 Text: every other block is flattened to a single string.
            for txt in (b for b in blocks if b['type'] != 1):
                text_tmp = ''.join(s['text'] for l in txt['lines'] for s in l['spans'])
                text_tmp = noise_chars.sub(' ', text_tmp).strip()
                if not text_tmp:
                    continue

                # Placeholder translation; swap in the real translator below.
                text_translate = '中国 ' + text_tmp
                # text_translate = youdao(text_tmp)
                leftover = new_page.insert_textbox(txt['bbox'], text_translate,
                                                   fontsize=6,
                                                   fontname='simhei',
                                                   fontfile=r'C:\Windows\Fonts\simhei.ttf')
                # insert_textbox() returns a negative number when the text does
                # not fit; in that case nothing is written, so report it rather
                # than silently losing the block.
                if leftover < 0:
                    print(f'  ! text does not fit in {txt["bbox"]}, block left empty: {text_tmp}')

        new_pdf.save(new_name)
    finally:
        # Release both documents even if a page fails to convert.
        new_pdf.close()
        src_pdf.close()

    print('\n------Done!-------')
    
    

    2. 有道翻译

    # %%
    import requests
    import json
    import time
    
    
    def youdao(en_txt=''):
        """Translate text with the Youdao web translation endpoint.

        Args:
            en_txt: Source text to translate.

        Returns:
            The translated text as a single string. An empty or whitespace-only
            input returns '' without sending a request.

        Raises:
            requests.RequestException: If the HTTP request fails or times out.
            KeyError: If the response carries no 'translateResult' entry.
        """
        if not en_txt or not en_txt.strip():
            return ''

        api_url = 'http://fanyi.youdao.com/translate'
        # Let requests build the query string so that characters such as
        # '&', '#', '+' or '%' in the text are percent-encoded instead of
        # corrupting the URL.
        params = {'i': en_txt, 'doctype': 'json'}

        res = requests.get(api_url, params=params, timeout=10.0).json()
        # Pause after every call; presumably to stay under the service's rate
        # limit - confirm before shortening.
        time.sleep(3.0)

        # 'translateResult' is a list of paragraphs, each a list of segments;
        # join all of them rather than only the first paragraph.
        zh_txt = ''.join(seq['tgt'] for para in res['translateResult'] for seq in para)

        print(f'*** {en_txt} \n--> {zh_txt}')

        return zh_txt
    
    
    # Smoke test: translate one sample sentence (sends a live request).
    en_txt = ('so we beat on, boats against the current, '
              'borne back ceaselessly into the past.')
    youdao(en_txt)
    

    3. 百度翻译

    # %%
    import requests
    import random
    import json
    from hashlib import md5
    import time
    
    
    # Baidu Translate credentials (placeholder values; replace with your own).
    # ref: https://api.fanyi.baidu.com/doc/
    appid, appkey = '2222222222222222', 'ooooooooooooooooooo'
    
    # Helper used to build the request signature.
    def make_md5(s, encoding='utf-8'):
        """Return the hexadecimal MD5 digest of ``s`` encoded with ``encoding``."""
        hasher = md5()
        hasher.update(s.encode(encoding))
        return hasher.hexdigest()
    
    
    def baidu(en_txt=''):
        """Translate English text to Chinese with the Baidu Translate API.

        Args:
            en_txt: English source text.

        Returns:
            The translated text as a single string on success; '' for an empty
            or whitespace-only input (no request is sent); None when the API
            response contains no 'trans_result' (the response is printed).

        Raises:
            requests.RequestException: If the HTTP request fails or times out.
        """
        if not en_txt or not en_txt.strip():
            # Nothing to translate: skip the request and the post-call pause.
            return ''

        # The signature is the MD5 of appid + query + salt + appkey.
        salt = random.randint(32768, 65536)
        sign = make_md5(appid + en_txt + str(salt) + appkey)

        api_url = 'http://api.fanyi.baidu.com/api/trans/vip/translate'
        headers = {'Content-Type': 'application/x-www-form-urlencoded'}
        payload = {'appid': appid, 'q': en_txt, 'from': 'en', 'to': 'zh', 'salt': salt, 'sign': sign}

        res = requests.get(api_url, params=payload, headers=headers, timeout=3.0).json()
        # Pause after every call; presumably to stay under the service's rate
        # limit - confirm before shortening.
        time.sleep(3.0)

        if 'trans_result' in res:
            zh_txt = ''.join(seq['dst'] for seq in res['trans_result'])
            print(f'*** {en_txt} \n--> {zh_txt}')
            return zh_txt

        # Either an 'error_code' reply or an unexpected payload: show it instead
        # of failing silently, and make the "no translation" result explicit.
        print(f'*** {en_txt} \n??? {res}')
        return None
    
    
    # Smoke test: translate one sample sentence (sends a live request).
    en_txt = ('so we beat on, boats against the current, '
              'borne back ceaselessly into the past.')
    baidu(en_txt)
    

    相关文章

      网友评论

          本文标题:PDF文档全文翻译,保留原有的页面布局

          本文链接:https://www.haomeiwen.com/subject/zyqrqrtx.html