from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar
import re
def pdfMaxSize(file_path, min_size=12):
    """Return the largest character size found anywhere in a PDF.

    Every page yielded by ``extract_pages`` is scanned. Inside each
    ``LTTextContainer`` the children are iterated as text lines, and every
    ``LTChar`` in those lines is inspected. Characters smaller than
    *min_size* are ignored.

    Args:
        file_path: PDF location, passed unchanged to ``extract_pages``.
        min_size: Smallest ``LTChar.size`` taken into account. Defaults to
            12, the threshold that was previously hard-coded.

    Returns:
        The largest qualifying ``LTChar.size``, or 0 when no character
        reaches *min_size*. A document without any text lines also yields 0
        instead of making ``max()`` fail on an empty list.
    """
    # A running maximum avoids building a per-line list and needs no special
    # handling for documents that contain no text at all.
    largest = 0
    for page_layout in extract_pages(file_path):
        for element in page_layout:
            if not isinstance(element, LTTextContainer):
                continue
            for text_line in element:
                for character in text_line:
                    if isinstance(character, LTChar) and character.size >= min_size:
                        largest = max(largest, character.size)
    return largest
def pdfTitle(file_path):
    """Collect the first-page text lines set in the document's largest font.

    ``pdfMaxSize(file_path)`` supplies the size threshold. Each text line on
    the first page that contains at least one ``LTChar`` whose size reaches
    that threshold is recorded.

    Args:
        file_path: PDF location, passed unchanged to ``pdfMaxSize`` and
            ``extract_pages``.

    Returns:
        A list of ``[text, page_index, y0, font_size]`` lists, where ``text``
        is the stripped line text, ``page_index`` is always 1, ``y0`` is the
        line's ``y0`` attribute and ``font_size`` is the largest qualifying
        character size in the line. Entries are ordered by page index, then
        by descending integer-truncated ``y0``.
    """
    max_size = pdfMaxSize(file_path)
    page_index = 1
    feature_data = []

    # Only the first page is ever used, so ask pdfminer to stop after one
    # page instead of laying out the whole document and discarding the rest.
    for page_layout in extract_pages(file_path, maxpages=1):
        for element in page_layout:
            if not isinstance(element, LTTextContainer):
                continue
            for text_line in element:
                font_size = max(
                    (
                        character.size
                        for character in text_line
                        if isinstance(character, LTChar)
                        and character.size >= max_size
                    ),
                    default=0,
                )
                # A result of 0 means no character in this line qualified.
                if font_size > 0:
                    feature_data.append(
                        [
                            text_line.get_text().strip(),
                            page_index,
                            text_line.y0,
                            font_size,
                        ]
                    )

    # y0 is truncated to an int before comparing, so lines whose y0 values
    # fall within the same integer keep their extraction order (stable sort).
    return sorted(
        feature_data,
        key=lambda feature: (feature[1], -int(feature[2])),
    )
# Default sample document (UNC path to a network share); used when no path is
# given on the command line.
path2 = r'\\192.168.3.201\szse\annual_inquiry_letter\CDD00003738968HF.pdf'

if __name__ == '__main__':
    import sys

    # Optional first argument overrides the hard-wired sample path so the
    # script can be run against any PDF without editing the source.
    target_path = sys.argv[1] if len(sys.argv) > 1 else path2
    data = pdfTitle(target_path)
    print(data)
# Web-page residue, not part of the program: "网友评论" (reader comments heading).