美文网首页
利用python去除pdf水印

利用python去除pdf水印

作者: 是东东 | 来源:发表于2021-03-04 13:35 被阅读0次

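    本文的去水印方法只针对 PDF 文件(水印须以图片对象的形式嵌入)。
    思路:根据给定的水印样图,在 PDF 中找出与之相似的图片并将其删除。
    下面的代码依赖 PyMuPDF(fitz)、Pillow(PIL.Image)、imagehash 以及标准库 os,运行前需自行导入。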

    def _image_hashes(img, hash_size=32, highfreq_factor=4, image_scale=64):
        """Return the (phash, ahash, dhash, whash) fingerprints of an open image."""
        return (
            # perceptual hash
            imagehash.phash(img, hash_size=hash_size, highfreq_factor=highfreq_factor),
            # average hash
            imagehash.average_hash(img, hash_size=hash_size),
            # difference (gradient) hash
            imagehash.dhash(img, hash_size=hash_size),
            # discrete wavelet transform hash
            imagehash.whash(img, image_scale=image_scale, hash_size=hash_size, mode='db4'),
        )


    def is_same_img(area_chart, target_img, VPT):
        """Decide whether two PNG files show similar images.

        ``target_img`` is converted to RGB and resized (in memory) to the pixel
        size of ``area_chart``; both images are then fingerprinted with four
        hash algorithms. Each similarity is ``1 - distance / bit_count`` and
        the best of the four is compared with ``VPT``. When the images match,
        the file names, their sizes in KB and the score are printed.

        Paths are opened exactly as given and neither file is modified.

        :param area_chart: path of the first image; its size is the size both
            images are compared at
        :param target_img: path of the second image, resized before hashing
        :param VPT: similarity threshold; a score strictly above it is a match
        :return: True if the images are considered similar, otherwise False
        """
        # Only '.png' files take part in the comparison; anything else never
        # matches.
        if any(os.path.splitext(name)[1] != '.png' for name in (area_chart, target_img)):
            return False

        # Open each file once and let the context manager close it, instead of
        # leaking one handle per hash algorithm.
        with Image.open(area_chart) as first_img:
            common_size = first_img.size
            first_hashes = _image_hashes(first_img)

        # Resize a copy in memory. Saving the resized picture back over
        # target_img would shrink/stretch the caller's reference file a little
        # more on every call. LANCZOS is the filter ANTIALIAS used to alias.
        with Image.open(target_img) as second_img:
            resized = second_img.convert('RGB').resize(common_size, Image.LANCZOS)
        second_hashes = _image_hashes(resized)

        # Subtracting two hashes yields the number of differing bits;
        # len(h.hash) ** 2 is the bit count of a square hash.
        value_hash = max(
            1 - (left - right) / len(left.hash) ** 2
            for left, right in zip(first_hashes, second_hashes)
        )
        if value_hash > VPT:
            for name in (area_chart, target_img):
                print(name, str(os.path.getsize(name) / 1024) + 'KB')
            print(value_hash)
            print('***********************')
            return True
        return False
    
    
    def _image_matches_watermark(pdf_document, current_page, xref, area_chart, VPT):
        """Return True if the PDF image ``xref`` is judged similar to ``area_chart``."""
        pix = fitz.Pixmap(pdf_document, xref)
        # Only pixmaps with fewer than four components per pixel are compared
        # (GRAY or RGB); everything else is left untouched.
        if pix.n >= 4:
            return False
        save_path = "page%s_%s.png" % (current_page, xref)
        try:
            pix.writePNG(save_path)
            return is_same_img(save_path, area_chart, VPT)
        finally:
            # Remove the temporary PNG even when the comparison raises.
            if os.path.exists(save_path):
                os.remove(save_path)


    def delete_wartermark(target_path, area_chart, VPT=0.9):
        """Delete images that resemble a reference watermark from a PDF.

        Each image referenced by a page is dumped to a temporary PNG in the
        current working directory and compared with ``area_chart`` through
        ``is_same_img``; the objects of matching images are deleted. The result
        is saved next to the input with ``1`` appended to the file stem, and the
        input file is then removed. Errors are printed, not raised.

        :param target_path: path of the PDF; a path that does not contain
            ``.pdf`` is ignored
        :param area_chart: path of the reference watermark image
        :param VPT: similarity threshold passed on to ``is_same_img``
        :return: path of the saved PDF, or ``''`` when nothing was saved
        """
        save_pdf_path = ''
        try:
            if '.pdf' in target_path:
                with fitz.open(target_path) as pdf_document:
                    # The same image object is commonly referenced from many
                    # pages; examine every xref once so an object is never
                    # re-read after it has been deleted.
                    checked_xrefs = set()
                    for current_page in range(len(pdf_document)):
                        for image in pdf_document.getPageImageList(current_page):
                            xref = image[0]
                            if xref in checked_xrefs:
                                continue
                            checked_xrefs.add(xref)
                            if _image_matches_watermark(
                                    pdf_document, current_page, xref, area_chart, VPT):
                                pdf_document._deleteObject(xref)
                    root, extension = os.path.splitext(target_path)
                    output_path = root + '1' + extension
                    pdf_document.save(output_path)
                    # Publish the path only once the file really exists, so a
                    # failed save still returns ''.
                    save_pdf_path = output_path
                    print('成功----删除水印')
                if os.path.exists(target_path):
                    os.remove(target_path)
        except Exception as e:
            # Best-effort boundary: report the problem and return what we have.
            print(e)
            print('失败----删除水印')
        return save_pdf_path
    
    
    # Example run: clean one PDF using 'area_chart.png' (looked up relative to
    # the current working directory) as the watermark sample, threshold 0.73.
    pdf_document = r'C:\Users\Administrator\OneDrive\all_huaqiu\huaqiu_spider\test\input.pdf'
    delete_wartermark(
        target_path=pdf_document,
        area_chart='area_chart.png',
        VPT=0.73,
    )
    

    本文参考资料:
    Python操作PDF-文本和图片提取(使用PyPDF2和PyMuPDF)
    Python处理PDF的实用姿势
    使用PyPDF2在PDF上去除水印

    相关文章

      网友评论

          本文标题:利用python去除pdf水印

          本文链接:https://www.haomeiwen.com/subject/lkohqltx.html