美文网首页
内容格式还原

内容格式还原

作者: 月夜星空下 | 来源:发表于2022-04-05 11:43 被阅读0次
    import pdfplumber
    
    
    def pdfRestore(path, is_footer=True):  # restore PDF layout
        """
        Extract the text of a PDF and re-flow it into paragraphs.

        Every page is read with ``page.extract_text()``. A line that ends with
        a space right before its newline is treated as the end of a paragraph;
        every other line break is treated as a soft wrap and turned into a
        single space.

        :param path: path of the PDF file, passed to ``pdfplumber.open``.
        :param is_footer: when true, the last text line of every page is
            dropped (presumably a page number / footer -- confirm against the
            documents being processed).
        :return: the re-flowed text, or ``''`` when the file could not be
            read or processed (the error is printed, not raised).
        """
        try:
            with pdfplumber.open(path) as pdf:
                page_texts = []
                for page in pdf.pages:
                    # NOTE(review): guard in case extract_text() yields None for
                    # a page without text -- confirm for the installed version.
                    lines = (page.extract_text() or '').split('\n')
                    if is_footer:
                        lines = lines[:-1]
                    page_texts.append('\n'.join(lines))
            # Join pages with a newline so a page boundary behaves like any
            # other line break instead of gluing two lines together.
            return _restore_layout('\n'.join(page_texts))
        except Exception as e:
            # Best-effort contract: report the problem and hand back an empty
            # string rather than an undefined or stale value.
            print('pdfRestore erro:', e)
            return ''


    def _pick_placeholders(text, count=2):
        """Return ``count`` distinct non-whitespace characters absent from ``text``."""
        present = set(text)
        chosen = []
        code_point = 0xE000  # first code point of the Unicode private-use area
        while len(chosen) < count:
            candidate = chr(code_point)
            if candidate not in present and not candidate.isspace():
                chosen.append(candidate)
            code_point += 1
        return chosen


    def _restore_layout(content):
        """Re-flow raw page text: keep paragraph breaks, unwrap soft line breaks."""
        # Placeholders are picked so that they cannot clash with characters
        # that really occur in the document (fixed markers such as '$$' could).
        para_mark, space_mark = _pick_placeholders(content)

        text = content.replace(' \n', para_mark)   # remember paragraph ends
        text = ' '.join(text.split())              # soft wraps / runs -> one space
        text = text.replace(' ', space_mark)       # protect the remaining spaces
        text = text.replace(para_mark, ' \n')
        text = ' '.join(text.split())              # merge repeated paragraph ends
        text = text.replace(' ', '\n')             # one newline per paragraph end
        text = text.replace(space_mark, ' ')       # bring the real spaces back
        # A break directly followed by a space is folded back into that space.
        return text.replace('\n ', ' ')
    
    # Demo run: restore the layout of a local PDF and print the result.
    # A raw string keeps the Windows backslashes literal; in a normal string
    # '\D' and '\创' are invalid escape sequences that Python warns about.
    path = r'D:\Downloads\创业板首次公开发行证券发行与承销特别规定(2021年修订).pdf'
    data = pdfRestore(path)
    print(data)
    

    相关文章

      网友评论

          本文标题:内容格式还原

          本文链接:https://www.haomeiwen.com/subject/jddksrtx.html