import pdfplumber
def pdfRestore(path, is_footer=True):  # restore PDF layout
    """
    Extract the text of a PDF and re-flow it into paragraphs.

    The text of every page is extracted with pdfplumber (optionally without
    its last line) and the pages are concatenated with no separator. In the
    combined text, a space immediately followed by a line break is treated
    as a paragraph break; every other run of whitespace is collapsed to a
    single space.

    :param path: location of the PDF, passed unchanged to ``pdfplumber.open``.
    :param is_footer: when true, the last extracted line of each page is
        dropped (presumably the page footer -- confirm for your documents).
    :return: the re-flowed text, or an empty string when the PDF could not
        be read (the error is printed, not raised).
    """
    try:
        with pdfplumber.open(path) as pdf:
            page_texts = []
            for page in pdf.pages:
                # Guard against extract_text() giving None (reported for
                # pages without text on some pdfplumber versions), so one
                # empty page does not abort the whole document.
                text = page.extract_text() or ''
                if is_footer:
                    # Drop the last line of the page.
                    text = '\n'.join(text.split('\n')[:-1])
                page_texts.append(text)
    except Exception as e:
        # Best-effort boundary: report the problem and hand back a defined
        # value instead of an unbound (or stale, previously computed) result.
        print('pdfRestore erro:', e)
        return ''

    # Pages are joined without a separator, so text running across a page
    # boundary continues on the same line.
    content = ''.join(page_texts)

    # NOTE: '$$' and '##' are used as temporary sentinels below; text that
    # already contains either sequence will be altered by this pass.
    # 1. Mark "space + line break" (treated as a paragraph end) with '$$'.
    result = content.replace(' \n', '$$')
    # 2. Collapse all remaining whitespace, including plain line breaks.
    result = ' '.join(result.split())
    # 3. Protect the surviving single spaces as '##'.
    result = result.replace(' ', '##')
    # 4. Turn the paragraph marks back into whitespace ...
    result = result.replace('$$', ' \n')
    # 5. ... so that runs of adjacent marks collapse into one space and
    #    marks at the very start/end of the text are stripped.
    result = ' '.join(result.split())
    # 6. Every space left now stands for a paragraph break.
    result = result.replace(' ', '\n')
    # 7. Restore the protected spaces.
    result = result.replace('##', ' ')
    # 8. A break that is directly followed by a space is folded into it.
    result = result.replace('\n ', ' ')
    return result
if __name__ == '__main__':
    # Example run; guarded so that importing this module does not try to
    # parse a PDF from a machine-specific location.
    # Raw string: '\D' and '\创' are not valid escape sequences, so the
    # backslashes must be taken literally (the resulting value is unchanged).
    path = r'D:\Downloads\创业板首次公开发行证券发行与承销特别规定(2021年修订).pdf'
    data = pdfRestore(path)
    print(data)
# 网友评论 (reader-comments heading left over from the source web page; not part of the script)