美文网首页
python实现PDF文档间对比(百度文本识别接口)

python实现PDF文档间对比(百度文本识别接口)

作者: SyKay | 来源:发表于2020-11-30 10:58 被阅读0次

    一、原理
    1、将PDF文档每页转换为图像
    2、调用百度通用文本识别页面接口,对图像进行内容识别
    3、对图像内容进行对比,并将对比不一致的内容在文档图像上进行标记(红框)
    4、将对比结果以表格形式输出为HTML文件,以便人工查看和核对差异

    二、范围和限制
    1、目前仅支持PDF文档之间的对比
    2、无法识别图形(盖章和logo)、不清晰字迹
    3、需要联网使用(OCR使用的是百度通用文本识别接口,仅限测试使用,暂不限次数)
    4、对比存在误差(原因为百度OCR识别无法达到100%准确)

    三、安装库
    pip install pymupdf
    pip install requests

    四、参数
    originPDF: PDF文档原件路径
    contrastPDF: PDF文档扫描件路径
    resultRoot: 输出结果路径(提示:程序运行时会先清空该目录,请不要直接设置为桌面等存有其他文件的目录)
    输出 : 标注差异的文档图像、Html文档

    五、源码

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    
    __author__ = '孙思锴'
    
    import os
    import shutil
    import fitz
    import difflib
    from datetime import datetime
    import base64
    from PIL import Image
    from PIL import ImageDraw
    import requests
    from concurrent.futures import ThreadPoolExecutor
    
    # One HTTP session reused by every OCR request made in getImageInfo.
    session = requests.session()
    originDic = {}  # filled by imageDiff: page number -> unmatched text found in the original document
    contrastDic = {}  # filled by imageDiff: page number -> unmatched text found in the scanned document
    url = 'https://ai.baidu.com/aidemo'  # URL of the Baidu text-recognition endpoint
    # Browser-like User-Agent header sent with each request.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36",
    }
    
    
    def initRoot(rootPath):
        """
        Reset a directory so that it exists and is empty.

        Anything already present at the path is deleted recursively before
        the directory is created again.

        :param rootPath: directory to (re)create; may be a relative path
        :return: absolute path of the freshly created directory
        """
        target = os.path.abspath(rootPath)
        already_there = os.path.exists(target)
        if already_there:
            # Remove results left over from a previous run.
            shutil.rmtree(target)
        os.makedirs(target)
        return target
    
    
    def conver_img(pdfFilepath, outputPath):
        """
        Render every page of a PDF document to a PNG image.

        The images are written to ``<outputPath>/<PDF file name without
        extension>/<page index>.png``; page indexes start at 0. A directory
        of that name that already exists is deleted first.

        :param pdfFilepath: path of the PDF document
        :param outputPath: directory under which the image directory is created
        :return: (number of pages, path of the image directory)
        :raises SystemExit: with exit status 1 when the PDF file does not exist
        """
        pdfFilepath = os.path.abspath(pdfFilepath)
        if not os.path.exists(pdfFilepath):
            print('文件不存在:', pdfFilepath)
            # A missing input is a failure, so leave with a non-zero status.
            # Raising SystemExit directly also avoids depending on the
            # site-provided ``exit`` helper.
            raise SystemExit(1)

        # The image directory is named after the PDF file, minus its extension.
        pdfNamePath, _extension = os.path.splitext(os.path.basename(pdfFilepath))
        ImagePath = os.path.join(outputPath, pdfNamePath)
        if os.path.exists(ImagePath):
            shutil.rmtree(ImagePath)  # drop images from a previous run
        os.makedirs(ImagePath)

        # A zoom factor of 2 in each dimension yields four times the pixels.
        # The matrix does not depend on the page, so it is built once.
        trans = fitz.Matrix(2.0, 2.0)

        doc = fitz.open(pdfFilepath)
        try:
            # len(doc) is the page count; it is read before the document is
            # closed because it is also the return value.
            pageTotal = len(doc)
            for page_index in range(pageTotal):
                page = doc[page_index]
                # Prefer PyMuPDF's snake_case method names and fall back to
                # the camelCase names this script was originally written for.
                render = getattr(page, 'get_pixmap', None) or page.getPixmap
                pm = render(matrix=trans, alpha=False)
                save = getattr(pm, 'save', None) or pm.writePNG
                save(os.path.join(ImagePath, str(page_index) + '.png'))
        finally:
            doc.close()  # release the file handle even if rendering fails
        return pageTotal, ImagePath
    
    
    def getImageInfo(filename):
        """
        Recognise the text of an image with the Baidu OCR demo endpoint
        (general text recognition, high accuracy with positions).

        1. Encode the image file as a base64 data URI.
        2. Build the form data and POST it.
        3. Retry while the service answers "too many requests" (errno 102).

        :param filename: path of the image file
        :return: the decoded JSON response of the last attempt, e.g.
        {'errno': 102, 'msg': '请求Demo过于频繁', 'data': ''}  (called too frequently)
        {'errno': 106, 'msg': '文件类型错误', 'data': ''}  (wrong file type)
        {'errno': 0, 'msg': 'success', 'data': {'log_id': '9163508383702196122', 'words_result_num': 30, 'words_result': [{'location': {'width': 142, 'top': 87, 'left': 202, 'height': 41}, 'words': '发银行'}, {'location': {'width': 86, 'top': 106, 'left': 909, 'height': 28}, 'words': '保密协议'}]}}
        """
        import time  # local import: only needed for the retry back-off

        with open(filename, 'rb') as f:
            encoded = base64.b64encode(f.read()).decode()
        dic = {
            "image": 'data:image/png;base64,' + encoded,
            "image_url": "",
            "type": "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate",
            "detect_direction": "false",
        }

        maxAttempts = 5
        result = None
        for attempt in range(maxAttempts):
            # The timeout keeps a stalled connection from blocking forever.
            result = session.post(url=url, headers=headers, data=dic, timeout=60).json()
            if result['errno'] != 102:
                break
            if attempt + 1 < maxAttempts:
                # errno 102 means the demo is being called too often, so wait
                # a little longer before each new attempt.
                time.sleep(0.5 * (attempt + 1))
        # When every attempt was rate limited, the last errno-102 response is
        # returned instead of silently falling through to None.
        return result
    
    
    def imageDiff(resultRoot, originFile, contrastFile, page=1):
        """
        Compare the OCR text of two page images and mark the differences.

        Word blocks that could not be matched are outlined in red on their
        image, their text is stored in ``originDic[page]`` and
        ``contrastDic[page]``, and both annotated images are saved side by
        side as one PNG file in ``resultRoot``.

        :param resultRoot: output directory of the combined image
        :param originFile: page image of the original document
        :param contrastFile: page image of the scanned document
        :param page: page number used as dictionary key and in the file name
        :return: None
        :raises RuntimeError: when an OCR response contains no word blocks
        """
        originBlocks = _ocrWordBlocks(originFile)
        contrastBlocks = _ocrWordBlocks(contrastFile)

        # Vertical tolerance that absorbs position drift between the two images.
        offset = 40
        # Try every block of the original against the scanned blocks. Both
        # loops walk over copies because matched blocks are removed on the way.
        for origin_words in list(originBlocks):
            top = origin_words['location']['top']
            text = origin_words['words']
            for contrast_words in list(contrastBlocks):
                if abs(top - contrast_words['location']['top']) >= offset:
                    continue  # not on the same line
                if text == contrast_words['words']:
                    # Identical text: both blocks are accounted for.
                    contrastBlocks.remove(contrast_words)  # scanned block
                    originBlocks.remove(origin_words)  # original block
                    break
                if text in contrast_words['words']:
                    # The scanned block contains the original text plus more:
                    # the original block is matched, and only the remaining
                    # text of the scanned block stays flagged.
                    originBlocks.remove(origin_words)
                    contrast_words['words'] = contrast_words['words'].replace(text, '', 1)
                    break

        with Image.open(originFile) as originImage, Image.open(contrastFile) as contrastImage:
            originDic[page] = _markBlocks(originImage, originBlocks)
            contrastDic[page] = _markBlocks(contrastImage, contrastBlocks)

            # Paste both annotated pages next to each other on a black canvas
            # that is as high as the taller of the two.
            originWidth, originHeight = originImage.size
            contrastWidth, contrastHeight = contrastImage.size
            canvasSize = (originWidth + contrastWidth, max(originHeight, contrastHeight))
            new_Image = Image.new('RGB', canvasSize, "#000000")
            new_Image.paste(originImage, (0, 0))
            new_Image.paste(contrastImage, (originWidth, 0))
            new_Image.save(os.path.join(resultRoot, "第" + str(page) + '页文档.png'))


    def _ocrWordBlocks(imagePath):
        """Run OCR on one page image and return its mutable list of word blocks."""
        ocrResult = getImageInfo(filename=imagePath)
        data = ocrResult.get('data') if isinstance(ocrResult, dict) else None
        if not isinstance(data, dict) or 'words_result' not in data:
            # Error responses carry an empty 'data' field; fail with a clear
            # message instead of an opaque TypeError further down.
            raise RuntimeError('OCR识别失败: {} -> {}'.format(imagePath, ocrResult))
        return data['words_result']


    def _markBlocks(image, blocks):
        """Outline every block in red on the image and return the blocks' text, one per line."""
        draw = ImageDraw.ImageDraw(image)
        for words in blocks:
            location = words['location']
            left, top = location['left'], location['top']
            right, bottom = left + location['width'], top + location['height']
            draw.rectangle(((left, top), (right, bottom)), outline='red', width=2)
        return ''.join(words['words'] + '\n' for words in blocks)
    
    
    if __name__ == '__main__':
        startTime = datetime.now()
        # Documents to compare and the directory that receives the results.
        originPDF = r'E:\Workspace\PycharmProjects\Python学习\合同文档比对\测试文档\测试文档-扫描件.pdf'  # original document
        contrastPDF = r'E:\Workspace\PycharmProjects\Python学习\合同文档比对\测试文档\测试文档-改字.pdf'  # scanned document
        resultRoot = r'E:\Workspace\PycharmProjects\Python学习\合同文档比对\测试文档\对比结果'  # output directory

        resultRoot = initRoot(resultRoot)  # empty the output directory
        originImageNum, originImagePath = conver_img(originPDF, resultRoot)  # original PDF -> page images
        contrastImageNum, contrastImagePath = conver_img(contrastPDF, resultRoot)  # scanned PDF -> page images
        if originImageNum != contrastImageNum:
            print('文档页数不一致!请查看', resultRoot)
            # The comparison cannot proceed: report failure with a non-zero status.
            raise SystemExit(1)
        resultRoot = os.path.join(resultRoot, '对比结果')  # sub-directory for the comparison results
        os.makedirs(resultRoot)

        try:
            # Compare all pages concurrently; leaving the with-block waits for
            # every submitted task to finish.
            with ThreadPoolExecutor() as executor:
                futures = [
                    executor.submit(
                        imageDiff,
                        resultRoot,
                        os.path.join(originImagePath, str(index) + '.png'),
                        os.path.join(contrastImagePath, str(index) + '.png'),
                        index + 1,
                    )
                    for index in range(originImageNum)
                ]
            # result() re-raises an exception raised inside a worker, so a
            # failed page stops the run here instead of showing up later as a
            # missing dictionary key.
            for future in futures:
                future.result()
        finally:
            session.close()  # all OCR requests are finished at this point

        # Write the per-page text differences to an HTML file. make_file
        # returns a complete HTML document for each page; the documents are
        # written one after another into the same file.
        diff = difflib.HtmlDiff()
        with open(os.path.join(resultRoot, '结果.html'), 'w', encoding="utf-8") as f:
            for pageNo in range(1, originImageNum + 1):
                make_content = diff.make_file(fromlines=originDic[pageNo].splitlines(),
                                              tolines=contrastDic[pageNo].splitlines(),
                                              fromdesc='原件第' + str(pageNo) + '页', todesc='扫描件第' + str(pageNo) + '页')
                f.write(make_content)

        endTime = datetime.now()
        print('文档共', originImageNum, '页,执行总时间:', endTime - startTime)
        print('执行成功,请查看输出目录:', resultRoot)
    
    

    六、执行结果示例:


    标注差异的对比照片
    Html文档表格

    相关文章

      网友评论

          本文标题:python实现PDF文档间对比(百度文本识别接口)

          本文链接:https://www.haomeiwen.com/subject/ncojwktx.html