美文网首页
pdfMaxSize

pdfMaxSize

作者: 月夜星空下 | 来源:发表于2022-01-06 11:04 被阅读0次
    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTTextContainer, LTChar
    import re
    
    
    def pdfMaxSize(file_path):
        """Return the largest character size (>= 12) found anywhere in a PDF.

        Every page of the document is laid out with pdfminer's
        ``extract_pages`` and each ``LTChar`` inside a text container is
        inspected. Characters whose ``size`` is below 12 are ignored.

        Args:
            file_path: Path of the PDF file, passed straight to
                ``extract_pages``.

        Returns:
            The largest ``LTChar.size`` value that is at least 12.

        Raises:
            ValueError: If the document contains no character with a size of
                12 or more (same exception type the previous ``max([])``
                call produced, but with an explanatory message).
        """
        min_size = 12  # characters smaller than this are never considered
        largest = 0
        for page_layout in extract_pages(file_path):
            for element in page_layout:
                if not isinstance(element, LTTextContainer):
                    continue
                for text_line in element:
                    for character in text_line:
                        # The isinstance check comes first so that non-LTChar
                        # children are skipped before ``size`` is read.
                        if (isinstance(character, LTChar)
                                and character.size >= min_size
                                and character.size > largest):
                            largest = character.size
        if largest == 0:
            # Nothing qualified: fail explicitly instead of with the opaque
            # "max() arg is an empty sequence".
            raise ValueError(
                f"no character with size >= {min_size} found in {file_path!r}"
            )
        return largest
    
    
    def pdfTitle(file_path):
        """Collect the first-page text lines set in the document's largest size.

        The reference size is obtained from ``pdfMaxSize(file_path)``, which
        scans the whole document. Only the first page is then searched: a line
        is kept when at least one of its ``LTChar`` children has a ``size``
        greater than or equal to that reference size.

        Args:
            file_path: Path of the PDF file, passed to ``pdfMaxSize`` and
                ``extract_pages``.

        Returns:
            A list of ``[text, page_index, y0, size]`` entries, where ``text``
            is the stripped line text, ``page_index`` is always 1, ``y0`` is
            the line's ``y0`` attribute and ``size`` is the largest qualifying
            character size on that line. Entries are sorted by page index and
            then by descending integer-truncated ``y0``. The list is empty if
            no first-page line qualifies.

        Raises:
            Whatever ``pdfMaxSize`` raises for this file.
        """
        max_size = pdfMaxSize(file_path)
        page_index = 1
        candidates = []
        for page_layout in extract_pages(file_path):
            for element in page_layout:
                if not isinstance(element, LTTextContainer):
                    continue
                for text_line in element:
                    line_size = 0
                    for character in text_line:
                        if (isinstance(character, LTChar)
                                and character.size >= max_size
                                and character.size > line_size):
                            line_size = character.size
                    if line_size > 0:
                        candidates.append(
                            [text_line.get_text().strip(), page_index,
                             text_line.y0, line_size]
                        )
            # Only the first page is examined; stopping here avoids pulling
            # the remaining pages out of extract_pages for nothing.
            break
        # int() truncation of y0 is kept so ties resolve exactly as before
        # (sorted() is stable, so tied lines stay in extraction order).
        return sorted(candidates, key=lambda entry: (entry[1], -int(entry[2])))
    
    
    # Sample input: a PDF addressed by a UNC path (raw string keeps the
    # backslashes literal).
    path2 = r'\\192.168.3.201\szse\annual_inquiry_letter\CDD00003738968HF.pdf'

    if __name__ == '__main__':
        # When run as a script, print the title candidates found for the sample.
        print(pdfTitle(path2))
    
    

    相关文章

      网友评论

          本文标题:pdfMaxSize

          本文链接:https://www.haomeiwen.com/subject/xywlcrtx.html