工作中遇到一个文件内容搜索的问题。有很多份不同的PDF文件,需要在它们里面搜索出含有关键词的段落。
一开始我查找了PDF库,可以实现关键词的搜索,但是文件中每一段落都有它自己的Heading No.,而PDF库搜索出来的结果无法得出这个Heading No.。
于是我尝试把文件改为Word文档。利用Word的Docx库去做搜索,同样还是无法处理Heading No.的问题。
准备放弃的时候,想到了HTML格式。如果我把Word文件另存为HTML格式,那么Heading No.所处的位置是否有对应的标签可以查询到?于是尝试了一大轮后,有了以下代码。代码中我还添加了一个窗口,给用户一个相对友好些的界面。
image.png
代码中通过正则表达式,判断每一段落的每一行开头,是否以特定的数字开始,因为Heading No是以数字开头的。如果是就把该段落内容保存到字典中,其中以Heading No作为字典的键。
片段如下:
# Collect the text of every heading/paragraph tag, keeping only the first
# occurrence of each distinct text.
all_content = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'p'])
new_list = []
for item in all_content:
    if item.text not in new_list:
        new_list.append(item.text)

# dic1 maps each heading line (clause no.) to the detail text that follows it.
dic1 = {}
Target = ""    # heading currently being filled; "" holds text before the first heading
content = ""   # detail text accumulated for Target
# A paragraph counts as a heading line when it starts with a digit 1-9.
pattern = re.compile(r"(^[1-9].+)")
for line in new_list:
    line = str(line.replace("\n", " "))
    if pattern.search(line):
        # A new heading starts: store what was collected for the previous one.
        dic1[Target] = content
        Target = line
        content = ""
    else:
        # Detail paragraph: append it to the current heading's content.
        content = content + line + "\n"
# Store the last section too; no further heading follows it to trigger the save.
dic1[Target] = content
完整的程序代码如下:
from tkinter import *
from bs4 import BeautifulSoup
import requests
import re
import os,sys
import tkinter.font as tf
# A paragraph counts as a heading line when it starts with a digit 1-9
# followed by at least one more character.
_HEADING_PATTERN = re.compile(r"^[1-9].+")


def _unique_texts(tags):
    """Return the text of each tag in order, skipping text already seen."""
    seen = set()
    texts = []
    for tag in tags:
        if tag.text not in seen:
            seen.add(tag.text)
            texts.append(tag.text)
    return texts


def _group_by_heading(paragraphs):
    """Map each heading line to the newline-joined paragraphs that follow it.

    Text that appears before the first heading is stored under the key "".
    """
    sections = {}
    heading = ""
    body = []
    for paragraph in paragraphs:
        line = paragraph.replace("\n", " ")
        if _HEADING_PATTERN.search(line):
            sections[heading] = "".join(body)
            heading = line
            body = []
        else:
            body.append(line + "\n")
    # Flush the final section; no later heading exists to trigger the save.
    sections[heading] = "".join(body)
    return sections


def _find_matches(sections, word):
    """Build the nested result list for every section containing *word*."""
    if not word:
        # An empty keyword would match every line (and empty strings).
        return []
    # The keyword is literal text, so escape regex metacharacters such as "+".
    line_pattern = re.compile(r".*%s.*" % re.escape(word), re.I | re.M)
    result = []
    for heading, body in sections.items():
        hits = line_pattern.findall(body)
        if hits:
            result.append(heading + "\n")
            result.append([hit + "\n" for hit in hits])
            result.append("\n")
    return result


def search_file(file, word):
    """Search one HTML file for *word* and append the hits to output_content.

    The file is parsed with BeautifulSoup; the text of its h1-h5 and p tags
    is grouped under the preceding heading line (a line starting with a
    digit 1-9). For every section with at least one line containing *word*
    (case-insensitive, literal match) the heading and the matching lines are
    appended to the global ``output_content``, one per line, after a banner
    naming the file.

    Args:
        file: Path of the HTML file; it is opened as UTF-8.
        word: Keyword to look for. An empty keyword yields no hits.

    Returns:
        None (the value returned by print_result).
    """
    global output_content
    output_content = (
        output_content + "\n" + "*" * 30 + file.split("\\")[-1]
        + " Search Result" + "*" * 30 + "\n" * 2
    )
    # The context manager closes the file once its content has been parsed.
    with open(file, 'r', encoding='utf-8') as htmlfile:
        soup = BeautifulSoup(htmlfile.read(), 'lxml')
    tags = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'p'])
    sections = _group_by_heading(_unique_texts(tags))
    return print_result(file, _find_matches(sections, word))
def print_result(file, nums):
    """Append every string in the (possibly nested) list *nums* to output_content.

    Nested lists are walked depth-first in order; nothing is inserted
    between the pieces. *file* is only passed along to recursive calls.
    """
    global output_content
    for entry in nums:
        if not isinstance(entry, list):
            output_content += entry
            continue
        print_result(file, entry)
def get_file(folder="D:\\My Documents\\CheckQP\\QP"):
    """Return the joined path of every entry in *folder*.

    Args:
        folder: Directory to list. Defaults to the reference-file folder
            used by this program.

    Returns:
        A list of ``os.path.join(folder, name)`` for each name reported by
        ``os.listdir`` (order as returned by the OS).

    Raises:
        OSError: If *folder* cannot be listed.
    """
    return [os.path.join(folder, name) for name in os.listdir(folder)]
def get_process_file(folder="D:\\My Documents\\CheckQP\\QP"):
    """Return the joined path of every ``.html`` file name in *folder*.

    The extension is taken with ``os.path.splitext`` and compared
    case-insensitively, so ``A.HTML`` is accepted while a file literally
    named ``html`` (no extension) is not.

    Args:
        folder: Directory to list. Defaults to the folder holding the
            documents saved as HTML.

    Returns:
        A list of ``os.path.join(folder, name)`` for each matching name
        (order as returned by the OS).

    Raises:
        OSError: If *folder* cannot be listed.
    """
    return [
        os.path.join(folder, name)
        for name in os.listdir(folder)
        if os.path.splitext(name)[1].lower() == ".html"
    ]
def find_files(root_dir, word):
    """Search every file returned by get_process_file() for *word*.

    Resets the global ``output_content`` to an empty string, lets
    search_file() append its findings for each file, and returns the
    accumulated text. Note that *root_dir* is accepted but not used here.
    """
    global output_content
    output_content = ""
    for html_path in get_process_file():
        search_file(html_path, word)
    return output_content
# Build the main window and the font used by the result area.
root=Tk()
root.title('Priscilla')
ft=tf.Font(family="Segoe UI",size=12)
# Three frames stacked vertically: banner, keyword input row, result area.
frame1=Frame(root)
frame2=Frame(root)
frame3=Frame(root)
frame1.grid(row=1,column=0)
frame2.grid(row=2,column=0)
frame3.grid(row=3,column=0)
# Banner text and the label in front of the keyword entry.
Label(frame1,text="Welcome to use Priscilla, it is used to seach design guideline content").grid(row=0,column=0,padx=5,pady=5)
Label(frame2, text="Enter a key word: ").grid(row=3,column=0,padx=5,pady=5,sticky=W)
# Entry for the keyword; show() reads its value through e1.get().
v1=StringVar()
e1=Entry(frame2,textvariable=v1)
e1.grid(row=3,column=1,padx=5,pady=5,sticky=E)
def show():
    """Run a search for the entered keyword and display the result.

    Reads the keyword from the entry widget ``e1``, runs find_files() and
    replaces the content of the ``text`` widget with the returned report,
    so results of an earlier search do not mix with the new ones.
    """
    root_dir = "D:\\My Documents\\CheckQP\\QP\\"
    word = e1.get()
    report = find_files(root_dir, word)
    # Clear whatever a previous search left in the widget before inserting.
    text.delete("1.0", END)
    text.insert(END, report)
# Search button next to the keyword entry; clicking it calls show().
Button(frame2,text="Search",width=10,command=show)\
.grid(row=3,column=2,padx=10,pady=5,sticky=E)
# Result area: a Text widget with a vertical scrollbar on its right side.
scrollbar = Scrollbar(frame3)
scrollbar.pack(side=RIGHT, fill=Y)
text = Text(frame3,font=ft, yscrollcommand=scrollbar.set)
text.pack(side=LEFT,fill=BOTH)
# Link the scrollbar to the Text widget's vertical view.
scrollbar.config(command=text.yview)
# Start the tkinter event loop.
mainloop()
网友评论