- pymupdf
- pip install pymupdf
- https://pymupdf.readthedocs.io/en/latest/index.html
- youdao api
1. 采用 pymupdf 识别 pdf 的图片和文字
import fitz
import re
from pprint import pprint
# Rebuild the source PDF page by page: images are copied unchanged and every
# text block is re-inserted (translated) into the same bounding box.
pdf_name = 'xxx-en.pdf'
print(f'Source pdf file: {pdf_name} \n')

src_pdf = fitz.open(pdf_name)
new_pdf = fitz.open()  # called without a file name: starts a new, empty PDF

for p, page in enumerate(src_pdf):
    print(f'\n- translating PAGE -{p}- ...')

    # 1.1 Create a new page with the same size as the source page.
    new_page = new_pdf.new_page(width=page.rect.width, height=page.rect.height)
    blocks = page.get_text('dict')['blocks']

    # 1.2 Images: blocks with type == 1 are copied into the same rectangle.
    img_blks = [b for b in blocks if b['type'] == 1]
    for img in img_blks:
        new_page.insert_image(img['bbox'], stream=img['image'])

    # 1.3 Text: every remaining block is flattened to a single string.
    txt_blks = [b for b in blocks if b['type'] != 1]
    for txt in txt_blks:
        text_tmp = ''.join(
            [span['text'] for line in txt['lines'] for span in line['spans']]
        )
        # Replace symbols, quotes and whitespace control characters with
        # spaces before the text is handed to a translator.
        text_tmp = re.sub('[@#$%^&*\'\"\n\r\t]', ' ', text_tmp).strip()
        if text_tmp:
            # Placeholder translation; use youdao(text_tmp) for real output.
            text_translate = '中国 ' + text_tmp
            # text_translate = youdao(text_tmp)

            # The default Helvetica font has no Chinese glyphs, so the
            # built-in Simplified Chinese font is selected explicitly.
            new_page.insert_textbox(
                txt['bbox'], text_translate, fontname='china-s'
            )

    # Uncomment to stop after the second page while debugging.
    # if p == 1:
    #     break

# Write the rebuilt document next to the source, e.g. xxx-en.pdf -> xxx-en-zh.pdf.
new_name = pdf_name.replace('.pdf', '-zh.pdf')
new_pdf.save(new_name)
2. 有道翻译
# %%
# %%
import requests
import json
import time
def youdao(en_txt=''):
    """Translate English text to Chinese through the Youdao web endpoint.

    Args:
        en_txt: English text to translate.

    Returns:
        The concatenated ``tgt`` strings found in the first entry of the
        response's ``translateResult`` list. An empty ``en_txt`` returns
        ``''`` without sending a request.

    Raises:
        requests.RequestException: on connection failure or timeout.
        KeyError, IndexError: if the response JSON lacks the expected
            ``translateResult`` structure.
    """
    if not en_txt:
        # Nothing to translate; skip the network round trip.
        return ''

    api_url = 'http://fanyi.youdao.com/translate'
    # Pass the text through ``params`` so requests percent-encodes it;
    # formatting it straight into the URL corrupts the query as soon as the
    # text contains '&', '#', '+' or non-ASCII characters.
    query = {'i': en_txt, 'doctype': 'json'}
    # Same timeout as baidu(), so a stalled server cannot hang the caller.
    res = requests.get(api_url, params=query, timeout=3.0).json()
    # print(res)
    zh_txt = ''.join(seq['tgt'] for seq in res['translateResult'][0])
    print(f'*** {en_txt} \n--> {zh_txt}')
    return zh_txt
# Sample English sentence; presumably meant as input for a manual youdao()
# call — nothing visible here uses it (TODO confirm).
en_txt = 'so we beat on, boats against the current, borne back ceaselessly into the past.'
3. 百度翻译
# %%
import requests
import random
import json
from hashlib import md5
import time
# ref: https://api.fanyi.baidu.com/doc/
# Baidu translate API credentials; baidu() uses both to sign each request and
# sends appid in the payload.
# NOTE(review): these look like placeholder values — replace them with real
# credentials before use.
appid = '2222222222222222'
appkey = 'ooooooooooooooooooo'
# Generate salt and sign
def make_md5(s, encoding='utf-8'):
    """Return the hexadecimal MD5 digest of ``s``.

    Args:
        s: Text to hash.
        encoding: Codec used to turn ``s`` into bytes before hashing.

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    raw_bytes = s.encode(encoding)
    digest = md5(raw_bytes)
    return digest.hexdigest()
def baidu(en_txt=''):
    """Translate English text to Chinese through the Baidu translate API.

    Args:
        en_txt: English text to translate.

    Returns:
        The concatenated ``dst`` strings of the response's ``trans_result``
        list; ``''`` when ``en_txt`` is empty (no request is sent); ``None``
        when the response carries no ``trans_result`` (for example when it
        reports an ``error_code``, which is printed).

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    if not en_txt:
        # Nothing to translate; avoid spending a signed API call on it.
        return ''

    # The signature is the MD5 of appid + query text + salt + appkey.
    salt = random.randint(32768, 65536)
    sign = make_md5(appid + en_txt + str(salt) + appkey)

    api_url = 'http://api.fanyi.baidu.com/api/trans/vip/translate'
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    payload = {'appid': appid, 'q': en_txt, 'from': 'en', 'to': 'zh', 'salt': salt, 'sign': sign}
    # Send the fields as a form body, matching the declared Content-Type, so
    # long paragraphs are not limited by the maximum URL length.
    res = requests.post(api_url, data=payload, headers=headers, timeout=3.0).json()

    if 'trans_result' in res:
        zh_txt = ''.join(seq['dst'] for seq in res['trans_result'])
        print(f'*** {en_txt} \n--> {zh_txt}')
        return zh_txt
    if 'error_code' in res:
        print(f'*** {en_txt} \n??? {res}')
    # No translation available; callers must be prepared for None.
    return None
# Sample English sentence; presumably meant as input for a manual baidu()
# call — nothing visible here uses it (TODO confirm).
en_txt = 'so we beat on, boats against the current, borne back ceaselessly into the past.'