思路步骤:
功能:将pdf文件进行ocr识别
- 步骤1:pdf转换成高精度的png图片,保存到result/revolve的目录下
- 步骤2:将图片旋转摆正并保存,保存到result的目录下
- 步骤3:图片进行ocr识别并写入当前文档
# -*- coding: UTF-8 -*-
import os
import time
import fitz
import glob as glob
import requests
from aip import AipOcr
from PIL import Image
# Apply for a Baidu AI application to obtain APP_ID, API_KEY and SECRET_KEY,
# then replace the placeholder values below with your own credentials.
APP_ID = '你创建的APP_ID'
API_KEY = '你创建的API_KEY '  # public key
SECRET_KEY = '你创建的SECRET_KEY '  # secret key
aipOcr = AipOcr(APP_ID, API_KEY, SECRET_KEY)  # OCR client used by the recognition functions
# Constants that control the conversion.
pdf_filepath = "收费设施.pdf"  # path of the PDF to run OCR on
result_dir = "pdf_tableresult"  # directory where recognised table files are stored
revolve = "90"  # counter-clockwise rotation angle (degrees) applied to every page image
dir = "result/"  # directory where the rotated page images are written
type_transform = "excel"  # conversion type: "txt" -> text file, "excel" -> Excel workbook
# Choose the conversion type and run it.
def discriminate_main():
    """Convert the configured PDF to page images and OCR them.

    The recognition mode comes from the module-level ``type_transform``:
    ``"txt"`` runs plain-text recognition, ``"excel"`` runs table
    recognition. Any other value prints an error message.
    """
    images = discriminate_pdf()
    if type_transform == "txt":
        txt_discriminate(images)
    # ``elif`` (not a second ``if``) so that a successful "txt" run does not
    # fall through to the error branch below.
    elif type_transform == "excel":
        table_discriminate(images)
    else:
        print("输入转换类型错误")
# Render every page of the PDF to a PNG and save a rotated copy of it.
def open_files1(pdf_filepath):
    """Render each page of ``pdf_filepath`` to PNG and write a rotated copy.

    The unrotated render of page ``n`` is written to ``<dir>revolve/<n>.png``
    and the copy rotated counter-clockwise by ``revolve`` degrees is written
    to ``<dir><n>.png`` (``dir`` and ``revolve`` are module-level settings).

    Args:
        pdf_filepath: Path of the PDF document to render.
    """
    raw_dir = dir + 'revolve/'
    # Creating the nested directory also creates ``dir`` itself, so both
    # write targets exist before the first page is saved.
    os.makedirs(raw_dir, exist_ok=True)

    doc = fitz.open(pdf_filepath)
    # NOTE(review): pageCount / preRotate / getPixmap / writePNG are the legacy
    # camelCase PyMuPDF names; confirm they exist in the installed version.
    # A zoom factor of 2 on each axis yields four times as many pixels.
    trans = fitz.Matrix(2.0, 2.0).preRotate(0)
    angle = int(revolve)
    for pg in range(doc.pageCount):
        page = doc[pg]
        pm = page.getPixmap(matrix=trans, alpha=False)
        raw_path = raw_dir + '%s.png' % pg
        pm.writePNG(raw_path)
        # expand=True enlarges the canvas so a non-square page is not cropped
        # when it is rotated; the context manager closes the source image.
        with Image.open(raw_path) as im:
            im.rotate(angle, expand=True).save(dir + '%s.png' % pg)
# Collect the paths of the page images to recognise.
def pdf_photo_open():
    """Return the paths matching ``<dir>*png``, in page order.

    Files whose name (without extension) is a number are ordered numerically,
    so ``2.png`` comes before ``10.png``; any other matches follow in
    alphabetical order. Sorting is needed because ``glob`` itself makes no
    ordering guarantee.

    Returns:
        A list of path strings.
    """
    def page_order(path):
        stem = os.path.splitext(os.path.basename(path))[0]
        if stem.isdecimal():
            return (0, int(stem), stem)
        return (1, 0, stem)

    return sorted(glob.glob(dir + "*png"), key=page_order)
def get_file_content(filePath):
    """Read the file at ``filePath`` in binary mode and return its bytes."""
    with open(filePath, mode='rb') as stream:
        data = stream.read()
    return data
def file_download(url, file_path, timeout=30):
    """Download ``url`` and write the response body to ``file_path``.

    Used to fetch the Excel file produced by table recognition.

    Args:
        url: Address of the file to download.
        file_path: Destination path; an existing file is overwritten.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of saving an error page as the workbook.
    r.raise_for_status()
    with open(file_path, 'wb') as f:
        f.write(r.content)
# Request options passed to the OCR calls.
options = {
    "detect_direction": "true",  # detect text orientation
    "language_type": "CHN_ENG",  # language to detect
    "result_type": "excel",  # used for table recognition ("json" or "excel")
}
def discriminate_pdf():
    """Convert the configured PDF to rotated page images and load them.

    Returns:
        A list holding the raw bytes of every image reported by
        ``pdf_photo_open()``, in the order it returns them.
    """
    # Render the PDF pages to images first, rotated to the wanted angle.
    open_files1(pdf_filepath)
    # Then read every image that is to be recognised.
    return [get_file_content(path) for path in pdf_photo_open()]
def txt_discriminate(images):
    """Run text recognition on each image and append the text to ``test.txt``.

    Every recognised line is printed and appended to ``test.txt`` (UTF-8),
    one line of text per row.

    Args:
        images: Iterable of raw image bytes.
    """
    for image in images:
        result = aipOcr.webImage(image, options)  # web-image text recognition
        lines = result.get('words_result')
        if lines is None:
            # Presumably an error payload from the service; show it and move
            # on to the next image instead of failing with a KeyError.
            print(result)
            continue
        # Open the output once per image rather than once per recognised line.
        with open('test.txt', 'a+', encoding='UTF-8') as file:
            for entry in lines:
                ocr_txt = entry['words']
                print(ocr_txt)
                file.write(ocr_txt + '\n')
def table_discriminate(images, max_polls=10, poll_interval=1):
    """Run asynchronous table recognition on each image and download the result.

    For every image a recognition request is submitted, the service is polled
    until it reports completion, and the resulting file is downloaded into
    ``result_dir``. Because only the image bytes are available here, each
    file is named after the image's position in ``images`` (``0.xls``,
    ``1.xls``, ...).

    Args:
        images: Sequence of raw image bytes.
        max_polls: Maximum number of status queries per image.
        poll_interval: Seconds to wait between two status queries.
    """
    os.makedirs(result_dir, exist_ok=True)
    num = 0  # number of tables downloaded so far
    for position, image in enumerate(images):
        result = aipOcr.tableRecognitionAsync(image, options)  # submit the table request
        if 'result' not in result:
            # Presumably an error payload from the service; report and skip.
            print('第{0}张图片提交表格识别失败: {1}'.format(position, result))
            continue
        table_requestid = result["result"][0]["request_id"]

        # Recognition takes a while on the server side, so poll for the result.
        url = None
        for _ in range(max_polls):
            res = aipOcr.getTableRecognitionResult(table_requestid)  # look up the result by request id
            status = res.get('result', {})
            print(status.get('ret_msg'))
            # result_data only holds the download address once ret_msg is '已完成'.
            if status.get('ret_msg') == '已完成':
                url = status.get('result_data')
                print(url)  # download link of the Excel file
                break
            time.sleep(poll_interval)

        if not url:
            # Never download from a missing address after a timeout.
            print('第{0}张图片的表格识别未在规定时间内完成,已跳过。'.format(position))
            continue

        xls_name = '%s.xls' % position
        file_download(url, os.path.join(result_dir, xls_name))
        num = num + 1
        print('{0}: {1} 下载完成。'.format(num, xls_name))
        time.sleep(1)
# Script entry point: runs the whole PDF -> image -> OCR pipeline when this file is executed.
discriminate_main()
网友评论