import re
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar
# UNC network-share path of the PDF file that is passed to main() below.
# Raw string so the backslashes are kept literally.
path = r'\\192.168.3.201\szse\regulatory_function\gkxx_jgsy_00000740626.pdf'
def main(file_path, *, min_size=7.5, max_size=11, min_length=10, keyword='号') -> str:
    """Return the first matching text line of a PDF, with whitespace stripped.

    Pages, layout elements and text lines are visited in the order pdfminer
    yields them. A line matches when all of the following hold:

    * its raw text (as returned by ``get_text()``) is longer than
      ``min_length`` characters,
    * its raw text contains ``keyword``,
    * at least one of its ``LTChar`` glyphs has
      ``min_size <= size < max_size``.

    Args:
        file_path: PDF to scan; passed unchanged to
            ``pdfminer.high_level.extract_pages``.
        min_size: Inclusive lower bound on the glyph size.
        max_size: Exclusive upper bound on the glyph size.
        min_length: The raw line text must be strictly longer than this.
        keyword: Substring the raw line text must contain.

    Returns:
        The text of the first matching line with every newline and space
        character removed.

    Raises:
        IndexError: If no line in the document matches. The type is the one
            the earlier list-based implementation raised, so existing
            ``except IndexError`` handlers keep working.
    """
    for page_layout in extract_pages(file_path):
        for element in page_layout:
            if not isinstance(element, LTTextContainer):
                continue
            for text_line in element:
                text = text_line.get_text()
                # These conditions depend only on the line, so test them once
                # instead of once per character.
                if len(text) <= min_length or keyword not in text:
                    continue
                has_sized_glyph = any(
                    isinstance(character, LTChar)
                    and min_size <= character.size < max_size
                    for character in text_line
                )
                if has_sized_glyph:
                    # Only the first match is ever used, so stop here rather
                    # than laying out the remaining pages.
                    return text.replace('\n', '').replace(' ', '')
    raise IndexError(
        f"no text line containing {keyword!r} with a glyph size in "
        f"[{min_size}, {max_size}) found in {file_path!r}"
    )
# Script entry point; these statements run whenever the module is executed or
# imported. Extract the first matching line from the PDF at `path` and print
# it. main() raises IndexError when the document contains no matching line.
text = main(path)
print(text)
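# "网友评论" ("user comments") - apparently web-page text captured together with this snippet; not part of the script.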