pdfTitle

作者: 月夜星空下 | 来源:发表于2022-01-05 14:23 被阅读0次
    def pdfTitle(path, min_font_size=16):
        """Extract the title text from the first page of a PDF.

        Every text line on the first page that contains at least one glyph
        whose font size is ``min_font_size`` or larger is treated as part of
        the title. The selected lines are ordered by descending ``y0``
        (pdfminer coordinates grow upward, so this is top to bottom) and
        merged into a single string.

        Args:
            path: PDF to read; passed unchanged to ``extract_pages``.
            min_font_size: Smallest glyph size that marks a line as title
                text. Defaults to 16, the threshold that used to be
                hard-coded.

        Returns:
            The title lines joined with ``&``, with ASCII spaces removed and
            the separators around ``《``/``》`` cleaned up. An empty string is
            returned when no line qualifies.
        """
        title_lines = []
        # page_numbers=[0] together with maxpages=1 limits the scan to page one.
        for page_layout in extract_pages(path, page_numbers=[0], maxpages=1):
            for element in page_layout:
                if not isinstance(element, LTTextContainer):
                    continue
                for text_line in element:
                    # pdfminer can place whitespace-only text lines directly
                    # in the page layout; their children are single
                    # characters, which are not iterable and carry no title
                    # text, so they are skipped instead of crashing below.
                    if not isinstance(text_line, LTTextContainer):
                        continue
                    has_large_glyph = any(
                        isinstance(character, LTChar)
                        and character.size >= min_font_size
                        for character in text_line
                    )
                    if has_large_glyph:
                        title_lines.append(
                            (text_line.y0, text_line.get_text().strip())
                        )
        # y0 is truncated to an int, so lines at nearly the same height
        # compare equal and the stable sort keeps their extraction order.
        title_lines.sort(key=lambda line: -int(line[0]))
        joined = '&'.join(text for _, text in title_lines)
        return joined.replace(' ', '').strip('&').replace('&&《', '《').replace('》&', '》')
    

    相关文章

      网友评论

          本文标题:pdfTitle

          本文链接:https://www.haomeiwen.com/subject/mbxrcrtx.html