def pdfTitle(path, min_font_size=16):
    """Guess a PDF's title from the large-font text lines of its first page.

    Only the first page is laid out. A text line counts as title text when
    at least one of its glyphs has a font size of ``min_font_size`` or more.
    Matching lines are ordered by descending ``y0`` and merged into a single
    string, with ``'&'`` marking the boundary between lines.

    Args:
        path: PDF to analyse; passed unchanged to ``extract_pages``.
        min_font_size: Smallest ``LTChar.size`` that marks a line as title
            text. Expected to be positive. Defaults to 16, the threshold
            that was previously hard-coded.

    Returns:
        The merged title with every space removed, or ``''`` when no line
        on the first page reaches ``min_font_size``.
    """
    title_lines = []  # (y0, stripped text) for every qualifying line

    # page_numbers=[0] together with maxpages=1 restricts layout analysis
    # to the first page, so no per-page bookkeeping is needed.
    for page_layout in extract_pages(path, page_numbers=[0], maxpages=1):
        for element in page_layout:
            if not isinstance(element, LTTextContainer):
                continue
            for text_line in element:
                has_large_glyph = any(
                    isinstance(glyph, LTChar) and glyph.size >= min_font_size
                    for glyph in text_line
                )
                if has_large_glyph:
                    title_lines.append(
                        (text_line.y0, text_line.get_text().strip())
                    )

    # Larger y0 first (presumably top-to-bottom, since PDF coordinates grow
    # upwards -- confirm against pdfminer docs). y0 is truncated to an int,
    # so lines whose truncated y0 is equal keep their extraction order
    # because sorted() is stable.
    title_lines.sort(key=lambda item: -int(item[0]))

    joined = '&'.join(text for _, text in title_lines)
    # Clean-up, applied in this order: drop every space, trim '&' from both
    # ends, collapse '&&' in front of an opening '《', and drop the '&'
    # that directly follows a closing '》'.
    return (
        joined.replace(' ', '')
        .strip('&')
        .replace('&&《', '《')
        .replace('》&', '》')
    )
网友评论