美文网首页Python学习内容整理
多文件关键词内容搜索

多文件关键词内容搜索

作者: Lorence | 来源:发表于2020-02-09 13:43 被阅读0次

工作中遇到一个文件内容搜索的问题。有很多份不同的PDF文件,需要在它们里面搜索出含有关键词的段落。
一开始我查找了PDF库,可以实现关键词的搜索,但是文件中每一段落都有它自己的Heading No.,而PDF库搜索出来的结果无法得出这个Heading No.。
于是我尝试把文件改为Word文档。利用Word的Docx库去做搜索,同样还是无法处理Heading No.的问题。
准备放弃的时候,想到了HTML格式:如果我把Word文件另存为HTML格式,那么Heading No.所处的位置是否会有对应的标签可以查询到?于是尝试了一大轮后,有了以下代码。代码中我还添加了一个窗口,给用户一个相对好些的界面。


image.png

代码中通过正则表达式,判断每一段落的开头是否为数字(1–9),因为Heading No.是以数字开头的。如果是,就把它当作新的标题行,并把上一个标题下累积的正文保存到字典中,其中以标题行(含Heading No.)作为字典的键。
片段如下:

# Excerpt: group the document's paragraphs under the numbered heading that
# precedes them. `soup` is the parsed HTML document.
# Collect every h1-h5 and p element.
all_content=soup.find_all(['h1','h2','h3', 'h4', 'h5','p'])
    new_list=[]
    # Keep only the first occurrence of each distinct text.
    for item in all_content:
        if item.text not in new_list:
            new_list.append(item.text)
    dic1={}   # Maps each heading line to the body text collected beneath it.
    Target=""   # Heading currently being filled ("" until the first heading is seen).
    content=""   # Body text accumulated for Target.
    for line in new_list:
        # Flatten the element's text to a single line.
        line=str(line.replace("\n"," "))
        pattern=re.compile(r"(^[1-9].+)")   # A heading line starts with a digit 1-9.
        line_no=bool(pattern.search(line))  
        if line_no:                                          # This line is a heading.
            dic1[Target]=content               # Store the body collected for the previous heading.
            Target=line                                  
            content=""
            continue
        else:                                                   # This line is body text, not a heading.
            content=content+line+"\n"     # Append it to the current heading's body.
            continue
    # NOTE(review): within this excerpt nothing stores the final
    # Target/content pair after the loop ends, so the last section does not
    # appear to reach dic1 -- confirm against the full function.

完整的程序代码如下:

from tkinter import *
from bs4 import BeautifulSoup
import requests
import re
import os,sys
import tkinter.font as tf

def _collect_sections(paragraphs):
    """Group paragraph texts under the numbered heading that precedes them.

    A paragraph whose text starts with a digit 1-9 is treated as a heading
    line; every other paragraph is body text for the most recent heading.

    Args:
        paragraphs: Paragraph texts in document order.

    Returns:
        A list of ``(heading, body)`` tuples in document order. Text that
        appears before the first heading is stored under the heading ``""``.
        A list of pairs (rather than a dict) is used so that sections with a
        repeated heading line or identical bodies stay distinct.
    """
    heading_pattern = re.compile(r"(^[1-9].+)")
    sections = []
    heading = ""
    body_lines = []
    for raw_text in paragraphs:
        line = raw_text.replace("\n", " ")
        if heading_pattern.search(line):
            # A new heading closes the section collected so far.
            sections.append((heading, "".join(body_lines)))
            heading = line
            body_lines = []
        else:
            body_lines.append(line + "\n")
    # Flush the final section; without this the last heading's content
    # would never be searchable.
    sections.append((heading, "".join(body_lines)))
    return sections


def search_file(file, word):
    """Search one HTML file for a keyword and append the hits to the output.

    The file's h1-h5 and p elements are grouped into sections by numbered
    heading. For every section whose body contains ``word`` (case-insensitive,
    matched literally), the heading line followed by each matching body line
    is appended to the module-level ``output_content`` string.

    Args:
        file: Path of the HTML file to search (read as UTF-8).
        word: Keyword to look for. It is escaped before being used in a
            regular expression, so characters such as ``(`` or ``.`` are
            matched literally instead of raising ``re.error`` or acting as
            wildcards.

    Returns:
        Whatever ``print_result`` returns (``None``).
    """
    global output_content
    output_content = (
        output_content + "\n" + "*" * 30 + file.split("\\")[-1]
        + " Search Result" + "*" * 30 + "\n" * 2
    )

    # Close the file deterministically instead of leaking the handle.
    with open(file, 'r', encoding='utf-8') as htmlfile:
        soup = BeautifulSoup(htmlfile.read(), 'lxml')

    # Keep the first occurrence of each distinct text, preserving order.
    seen = set()
    paragraphs = []
    for item in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'p']):
        item_text = item.text
        if item_text not in seen:
            seen.add(item_text)
            paragraphs.append(item_text)

    keyword_pattern = re.compile(r".*%s.*" % re.escape(word), re.I | re.M)

    # Each hit contributes: heading, list of matching lines, newline.
    result = []
    for heading, body in _collect_sections(paragraphs):
        matches = keyword_pattern.findall(body)
        if matches:
            result.append(heading)
            result.append(matches)
            result.append("\n")
    return print_result(file, result)
            
def print_result(file, nums):
    """Append every string in a (possibly nested) list to ``output_content``.

    Nested lists are expanded depth-first, so the strings are appended in the
    same left-to-right order in which they appear.

    Args:
        file: Path of the file the results belong to (not used here).
        nums: Iterable of strings and/or nested lists of strings.
    """
    global output_content

    def _flatten(entries):
        # Yield the leaf items of arbitrarily nested lists, in order.
        for entry in entries:
            if isinstance(entry, list):
                yield from _flatten(entry)
            else:
                yield entry

    for piece in _flatten(nums):
        output_content = output_content + piece
    

def get_file(folder="D:\\My Documents\\CheckQP\\QP"):
    """Return the full paths of all entries in the reference folder.

    Args:
        folder: Directory to list. Defaults to the reference folder that was
            previously hard-coded, so existing no-argument calls behave the
            same.

    Returns:
        A list of ``folder``-joined paths, one per directory entry, sorted by
        name so the order does not depend on the file system.

    Raises:
        OSError: If ``folder`` does not exist or cannot be read.
    """
    return [os.path.join(folder, name) for name in sorted(os.listdir(folder))]

def get_process_file(folder=None):
    """Return the full paths of the HTML files in a folder.

    Args:
        folder: Directory to scan. When omitted or empty, the default
            reference folder is used, so existing no-argument calls keep
            working.

    Returns:
        A list of paths to files whose extension is ``.html`` or ``.htm``
        (compared case-insensitively), sorted by file name.

    Raises:
        OSError: If the folder does not exist or cannot be read.
    """
    if not folder:
        folder = "D:\\My Documents\\CheckQP\\QP"
    # Compare the real extension, ignoring case, so "REPORT.HTML" and
    # Word's default ".htm" are found while a file literally named "html"
    # (no extension) is not.
    return [
        os.path.join(folder, name)
        for name in sorted(os.listdir(folder))
        if os.path.splitext(name)[1].lower() in (".html", ".htm")
    ]


def find_files(root_dir, word):
    """Search every HTML file under ``root_dir`` for ``word``.

    Resets the module-level ``output_content`` string, runs ``search_file``
    on each HTML file found, and returns the accumulated text.

    Args:
        root_dir: Directory whose HTML files are searched. An empty value
            falls back to the default reference folder.
        word: Keyword passed on to ``search_file``.

    Returns:
        The accumulated search output for all files.
    """
    global output_content
    output_content = ""
    for file in get_process_file(root_dir):
        search_file(file, word)
    return output_content

# Main window and the font used by the results pane.
root = Tk()
root.title('Priscilla')
ft = tf.Font(family="Segoe UI", size=12)

# Three containers stacked vertically: banner, search bar, results.
frame1 = Frame(root)
frame2 = Frame(root)
frame3 = Frame(root)
for _row, _frame in enumerate((frame1, frame2, frame3), start=1):
    _frame.grid(row=_row, column=0)

# Banner text.
Label(
    frame1,
    text="Welcome to use Priscilla, it is used to seach design guideline content",
).grid(row=0, column=0, padx=5, pady=5)

# Search bar: prompt label followed by the keyword entry box.
Label(frame2, text="Enter a key word:  ").grid(
    row=3, column=0, padx=5, pady=5, sticky=W
)
v1 = StringVar()
e1 = Entry(frame2, textvariable=v1)
e1.grid(row=3, column=1, padx=5, pady=5, sticky=E)

def show():
    """Run a search for the keyword in the entry box and display the results.

    Reads the keyword from ``e1``, searches the reference folder through
    ``find_files``, and replaces the contents of the ``text`` widget with the
    returned output.
    """
    root_dir = "D:\\My Documents\\CheckQP\\QP\\"
    word = e1.get()
    results = find_files(root_dir, word)
    # Clear the previous search first; otherwise each new result set is
    # inserted at the cursor position in the middle of the old output.
    text.delete("1.0", END)
    text.insert(INSERT, results)

# Search button: runs show() when clicked.
search_button = Button(frame2, text="Search", width=10, command=show)
search_button.grid(row=3, column=2, padx=10, pady=5, sticky=E)

# Results pane: a text box with a vertical scrollbar wired to it.
scrollbar = Scrollbar(frame3)
scrollbar.pack(side=RIGHT, fill=Y)
text = Text(frame3, font=ft, yscrollcommand=scrollbar.set)
text.pack(side=LEFT, fill=BOTH)
scrollbar.config(command=text.yview)

# Hand control to the Tk event loop.
mainloop()

相关文章

网友评论

    本文标题:多文件关键词内容搜索

    本文链接:https://www.haomeiwen.com/subject/rfwgxhtx.html