美文网首页
PDF年报|非年报识别

PDF年报|非年报识别

作者: 月夜星空下 | 来源:发表于2021-05-20 09:57 被阅读0次
    import re
    import os
    # import jieba
    import pdfplumber
    
    # Directory whose entries are listed and scanned by the script below
    # (hard-coded absolute path; adjust it for the machine running the script).
    path = 'D:/Users/Desktop/test'
    
    
    # jieba.load_userdict("./dict.txt")
    
    
    def File_Eli(path):
        """Return the names of the non-hidden entries of a directory.

        An entry counts as hidden when its name starts with a dot.

        Args:
            path: Directory to list.

        Returns:
            A list of entry names (not full paths), in the order
            ``os.listdir`` reports them.
        """
        return [entry for entry in os.listdir(path) if not entry.startswith('.')]
    
    
    def PdfPath(path):
        """Return the text of a PDF's first page with all whitespace removed.

        Args:
            path: Location of the PDF file, handed to ``pdfplumber.open``.

        Returns:
            The text extracted from the first page with every run of
            whitespace deleted. An empty string is returned when the
            document has no pages or the first page yields no text.
        """
        with pdfplumber.open(path) as p:
            if not p.pages:
                return ''
            fpage = p.pages[0].extract_text()
        # NOTE(review): some pdfplumber releases return None instead of '' for
        # a page without a text layer (e.g. scanned images); passing None to
        # re.sub would raise TypeError, so treat it as "no text".
        if not fpage:
            return ''
        # Raw string avoids the invalid-escape warning for '\s'. Removing all
        # whitespace already removes leading/trailing blanks, so no strip().
        return re.sub(r'\s+', '', fpage)
    
    
    # Names of the non-hidden entries found in `path`; evaluated as soon as
    # the script runs, so `path` must exist at that point.
    eli = File_Eli(path)
    
    
    def Txt_Create(Target_Path, msg, encoding=None):
        """Write ``msg`` to a file named ``SoftLink.txt`` under a path prefix.

        Args:
            Target_Path: Prefix the file name is appended to. It is used
                verbatim (no separator is inserted), so it should normally
                end with a path separator.
            msg: Text to write; must be a ``str``.
            encoding: Optional text encoding passed to ``open``. ``None``
                (the default) keeps the platform's default encoding.

        Returns:
            The file object that was written to. It is already closed when
            returned; only attributes such as ``name`` remain useful.
        """
        full_path = Target_Path + 'SoftLink.txt'

        # The context manager guarantees the handle is closed even when
        # write() raises (e.g. msg is not a str or cannot be encoded).
        with open(full_path, 'w', encoding=encoding) as file:
            file.write(msg)
        return file
    
    
    def ReadPdf(Dir, ls):
        """Collect first-page texts of PDFs that look like full annual reports.

        Each name in ``ls`` is joined to ``Dir`` with '/', and the first page
        is read through ``PdfPath``. A file is kept when that text contains
        '年度报告' (annual report) and contains neither '摘要' (summary) nor
        '半年' (half-year). Every candidate path is printed; paths that
        match are printed a second time.

        Args:
            Dir: Directory holding the files.
            ls: File names inside ``Dir`` (e.g. the result of ``File_Eli``).

        Returns:
            A list with the whitespace-stripped first-page text of every
            matching file, in the order of ``ls``.
        """
        res = []
        all_urls = []
        for name in ls:
            url = Dir + '/' + name
            all_urls.append(url)
            print(url)
            try:
                f = PdfPath(url)
            except Exception as exc:
                # Best effort: unreadable or non-PDF files are skipped, but the
                # reason is reported, and Ctrl-C / SystemExit still propagate.
                print('skipped:', url, repr(exc))
                continue
            if '年度报告' in f and '摘要' not in f and '半年' not in f:
                res.append(f)
                print(url)
        # NOTE(review): res holds page texts while all_urls holds paths, so
        # this difference is effectively the de-duplicated texts. Kept as in
        # the original output; confirm what 'ret3' was meant to report.
        ret3 = list(set(res) - set(all_urls))
        print('ret3:', ret3)
        return res
    
    
    # Scan every listed file and print how many first pages were classified
    # as full annual reports (ReadPdf returns one text per matching file).
    sss = ReadPdf(path, eli)
    print(len(sss))
    
    

    相关文章

      网友评论

          本文标题:PDF年报|非年报识别

          本文链接:https://www.haomeiwen.com/subject/eqevjltx.html