#!/usr/bin/env python3
# -*- coding: utf-8 -*-
__author__ = 'silva'
# Usage text, printed when this file is run as a script. It is kept in its
# original language because it is runtime output.
# English summary: pass in the absolute path of the folder holding the PDFs
# to split (dir_path); a new folder (dir_path_split) is created to hold the
# split PDFs. Note: abnormal files that cannot be split are copied into the
# split folder as-is.
__doc__ = """
传入待拆分的pdf文件夹目录的绝对路径: dir_path
新建拆分文件保存拆分后的pdf:dir_path_split
Note: 异常不可拆分文件会保留copy到拆分文件夹里
"""
import os
from shutil import copy
from PyPDF2 import PdfFileReader, PdfFileWriter
# Input directories that run() iterates over. The commented-out entries are
# alternative inputs that are currently disabled.
split_dir_list = [
    # r'C:\Users\silva\Desktop\银行文档拆分\海峡银行原始数据',
    # r'C:\Users\silva\Desktop\银行文档拆分\桂林银行原始数据',
    # r'C:\Users\silva\Desktop\银行文档拆分\温州银行原始数据',
    r'C:\Users\silva\Desktop\新增追加提取_appdend\原文档copy'
]
# Paths of directory entries that run() did not hand to split_pdf
# (appended to by run(), printed when it finishes).
error_list = []
def run():
    """Split every PDF found directly inside each directory of ``split_dir_list``.

    For each input directory, a sibling directory named ``<input>_split`` is
    created if it is missing. Every regular file whose name ends in ``.pdf``
    is passed to ``split_pdf``. Every other entry is appended to the
    module-level ``error_list``; entries that are not directories are also
    copied unchanged into the output directory. The collected paths are
    printed once all directories have been processed.

    :return: None
    """
    pdf_suffix = '.pdf'
    for dir_path in split_dir_list:
        parent_dir, input_dir_name = os.path.split(dir_path)
        output_dir_path = os.path.join(parent_dir, input_dir_name + '_split')
        # Create the output folder; exist_ok avoids a check-then-create race.
        os.makedirs(output_dir_path, exist_ok=True)

        for file in os.listdir(dir_path):
            file_path = os.path.join(dir_path, file)
            if os.path.isfile(file_path) and file.endswith(pdf_suffix):
                # Remove exactly the trailing extension. str.strip('.pdf')
                # would instead strip any of the characters '.', 'p', 'd',
                # 'f' from BOTH ends (e.g. 'pdf_report.pdf' -> '_report'),
                # mangling names and letting different inputs collide.
                filename = file[:-len(pdf_suffix)]
                split_pdf(file_path, output_dir_path, filename)
                continue

            # Anything that is not a regular *.pdf file is reported.
            print('WARMING: 异常文件路径不能读写', file_path)
            error_list.append(file_path)
            if os.path.isdir(file_path):
                # Sub-directories are only recorded, never copied.
                continue
            # Keep a copy of the unsplittable file next to the split output.
            copy(file_path, os.path.join(output_dir_path, file))
    print('异常文件路径汇总:', error_list)
def split_pdf(infile_path, out_path, filename):
    """Write each page of a PDF to its own single-page PDF file.

    :param infile_path: path of the PDF file to split
    :param out_path: directory that receives the single-page PDFs
                     (created if it does not exist)
    :param filename: base name for the outputs; page N is written to
                     ``<filename>_<N>.pdf``, with N starting at 1
    :return: None
    """
    if not os.path.exists(out_path):
        os.makedirs(out_path)

    with open(infile_path, 'rb') as source:
        # All page access happens while the source file is still open.
        pdf = PdfFileReader(source, strict=False)
        page_count = pdf.getNumPages()
        banner = '---------当前文件 {0} 共:{1} 页----------'
        print(banner.format(infile_path, page_count))

        for page_no in range(1, page_count + 1):
            # A fresh writer per page so each output holds exactly one page.
            single = PdfFileWriter()
            single.addPage(pdf.getPage(page_no - 1))
            target = os.path.join(out_path, filename + '_{}.pdf'.format(page_no))
            with open(target, 'wb') as sink:
                single.write(sink)
            print('拆分成功:', target)
if __name__ == "__main__":
    # Script entry point: show the usage text, then process the configured
    # directories.
    print(__doc__)
    run()
# 网友评论  (scraped page footer, "reader comments"; not part of the script)