import re
import os
# import jieba
import pdfplumber
# Directory handed to File_Eli and ReadPdf by the module-level calls in this file.
path = 'D:/Users/Desktop/test'
# jieba.load_userdict("./dict.txt")
def File_Eli(path):
    """Return the names of the non-hidden entries in a directory.

    An entry is treated as hidden when its name starts with a dot.

    Args:
        path: Directory to list; passed straight to ``os.listdir``.

    Returns:
        A list of entry names (not full paths), in the order
        ``os.listdir`` produced them.

    Raises:
        OSError: Propagated from ``os.listdir``, e.g. when ``path``
            cannot be listed.
    """
    # The listing is consumed directly instead of rebinding ``path`` to it,
    # so the parameter keeps a single meaning throughout the function.
    return [name for name in os.listdir(path) if not name.startswith('.')]
def PdfPath(path):
    """Return the first page's text of a PDF with all whitespace removed.

    Args:
        path: Location of the PDF; passed straight to ``pdfplumber.open``.

    Returns:
        The text extracted from the first page with every run of
        whitespace deleted, or an empty string when nothing was extracted.

    Raises:
        Any error raised by ``pdfplumber`` while opening or reading the
        file, or by indexing the first page, is propagated to the caller.
    """
    with pdfplumber.open(path) as pdf:
        first_page_text = pdf.pages[0].extract_text()
    # Guard against a falsy extraction result (None or '') so re.sub is
    # never handed a non-string.
    if not first_page_text:
        return ''
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    # Removing every whitespace run already leaves nothing to strip.
    return re.sub(r'\s+', '', first_page_text)
# Runs at import time: names of the entries in `path` that do not start with '.'.
eli = File_Eli(path)
def Txt_Create(Target_Path, msg, encoding=None):
    """Write ``msg`` to a file named ``SoftLink.txt`` under ``Target_Path``.

    Args:
        Target_Path: Prefix prepended verbatim to ``'SoftLink.txt'``. No
            path separator is inserted, so a directory must be given with
            its trailing separator.
        msg: Text to write (``str``).
        encoding: Optional text encoding forwarded to ``open``. The
            default ``None`` keeps the platform's default encoding.

    Returns:
        The file object that was written; it is already closed.

    Raises:
        OSError: Propagated from ``open`` or ``write``.
    """
    full_path = Target_Path + 'SoftLink.txt'
    # The context manager closes the handle even if write() raises.
    with open(full_path, 'w', encoding=encoding) as file:
        file.write(msg)
    return file
def ReadPdf(Dir, ls):
    """Collect the first-page text of files that look like full annual reports.

    Every name in ``ls`` is appended to ``Dir`` with a '/' and read through
    ``PdfPath``. A text is kept when it contains '年度报告' (annual report)
    and contains neither '摘要' (summary) nor '半年' (half-year).

    Each candidate path is printed before it is read and printed again
    when its text is kept. Files that cannot be read are reported and
    skipped rather than aborting the scan.

    Args:
        Dir: Directory prefix, without a trailing '/'.
        ls: File names inside ``Dir``.

    Returns:
        A list with the kept first-page texts, in input order.
    """
    res = []
    urls = []  # named ``urls`` so the builtin ``all`` is not shadowed
    for name in ls:
        url = Dir + '/' + name
        urls.append(url)
        print(url)
        try:
            text = PdfPath(url)
        except Exception as exc:
            # Best effort: one unreadable file must not stop the scan, but
            # Ctrl-C / SystemExit are no longer swallowed.
            print('skip:', url, exc)
            continue
        if '年度报告' in text and '摘要' not in text and '半年' not in text:
            res.append(text)
            print(url)
    # NOTE(review): this subtracts file paths from page texts, so it is
    # just the distinct kept texts; kept as-is to preserve the printed
    # output -- confirm what was intended here.
    ret3 = list(set(res) - set(urls))
    print('ret3:', ret3)
    return res
# Runs at import time: read every listed file and print how many
# first-page texts ReadPdf kept.
sss = ReadPdf(path, eli)
print(len(sss))
# (Scraped web-page residue: "Reader comments" heading -- not part of the script.)