
Multi-File Keyword Content Search

Author: Lorence | Published 2020-02-09 13:43

    At work I ran into a file content search problem: given many different PDF files, I needed to find the paragraphs in them that contain a given keyword.
    At first I looked into PDF libraries. They can search for a keyword, but every paragraph in these files carries its own Heading No., and the search results from the PDF libraries give no way to recover that Heading No.
    So I tried converting the files to Word documents and searching them with the docx library, but it still could not handle the Heading No. problem.
    Just as I was about to give up, I thought of HTML: if I saved the Word files as HTML, the Heading No. should sit inside tags that can be queried directly. After a big round of experimenting I ended up with the code below. I also added a window to give users a somewhat friendlier interface.
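
    The post saves the Word files as HTML by hand, but that step can also be scripted. Below is a minimal sketch using pywin32's COM automation (Windows only); the folder path and the use of win32com are my assumptions, not part of the original workflow.

    # Hypothetical batch Word -> HTML conversion via COM (Windows only); requires pywin32.
    import os
    import win32com.client

    SRC = "D:\\My Documents\\CheckQP\\QP"  # assumed folder holding the Word source files

    word = win32com.client.Dispatch("Word.Application")
    word.Visible = False
    try:
        for name in os.listdir(SRC):
            if name.lower().endswith((".doc", ".docx")):
                path = os.path.join(SRC, name)
                doc = word.Documents.Open(path)
                # FileFormat=8 is wdFormatHTML in the Word object model
                doc.SaveAs2(os.path.splitext(path)[0] + ".html", FileFormat=8)
                doc.Close(False)
    finally:
        word.Quit()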



    The code uses a regular expression to check whether the first line of each paragraph starts with a digit, because every Heading No. begins with one. If it does, the paragraph's content is saved into a dictionary, with the Heading No. as the key.
    The relevant fragment:

    all_content = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'p'])
    new_list = []
    for item in all_content:
        if item.text not in new_list:
            new_list.append(item.text)
    dic1 = {}  # Maps each clause's heading no. to its detailed content
    Target = ""
    content = ""
    pattern = re.compile(r"^[1-9].+")  # Does the paragraph start with a heading no.?
    for line in new_list:
        line = line.replace("\n", " ")
        if pattern.search(line):        # The paragraph is a heading line:
            if Target:
                dic1[Target] = content  # save the finished section under its heading no.
            Target = line               # start collecting the new section
            content = ""
        else:                           # The paragraph is body text:
            content = content + line + "\n"  # append it to the current section
    if Target:
        dic1[Target] = content  # save the last section
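
    As a quick sanity check of the heading test, here is the same pattern applied to two made-up sample lines:

    import re

    pattern = re.compile(r"^[1-9].+")
    print(bool(pattern.search("4.2.1 Control of records")))       # True  -> treated as a heading
    print(bool(pattern.search("Records shall be retained ...")))  # False -> treated as body text

    Note that any paragraph starting with a digit 1-9 passes this test, so a body line such as "3 copies shall be issued" would also be treated as a heading. A stricter pattern (e.g. one requiring a dotted number, r"^\d+(\.\d+)+\s") would avoid that, at the cost of missing single-level headings such as "4 Quality records".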
    

    The complete program code follows:

    from tkinter import *
    from bs4 import BeautifulSoup
    import re
    import os
    import tkinter.font as tf
    
    def search_file(file, word):
        global output_content
        output_content = output_content + "\n" + "*"*30 + file.split("\\")[-1] + " Search Result" + "*"*30 + "\n"*2
        with open(file, 'r', encoding='utf-8') as htmlfile:
            demo = htmlfile.read()
        soup = BeautifulSoup(demo, 'lxml')
        all_content = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'p'])
        new_list = []
        for item in all_content:
            if item.text not in new_list:
                new_list.append(item.text)
        dic1 = {}  # Maps each clause's heading no. to its detailed content
        Target = ""
        content = ""
        pattern = re.compile(r"^[1-9].+")  # Does the paragraph start with a heading no.?
        for line in new_list:
            line = line.replace("\n", " ")
            if pattern.search(line):        # The paragraph is a heading line:
                if Target:
                    dic1[Target] = content  # save the finished section under its heading no.
                Target = line               # start collecting the new section
                content = ""
            else:                           # The paragraph is body text:
                content = content + line + "\n"  # append it to the current section
        if Target:
            dic1[Target] = content  # save the last section
        # Search every section for the keyword; on a match, report the heading no.
        # (the dict key) together with the matching lines.
        result = []
        pattern = re.compile(r".*%s.*" % re.escape(word), re.I | re.M)
        for key, value in dic1.items():
            rel = pattern.findall(value)
            if rel:
                result.append(key)
                result.append(list(rel))
                result.append("\n")
        return print_result(file, result)
                
    def print_result(file, nums):
        # Flatten the (possibly nested) result list into the output buffer
        global output_content
        for i in nums:
            if isinstance(i, list):
                print_result(file, i)
            else:
                output_content = output_content + i
        
    
    def get_file():  # List every file in the reference folder (currently unused)
        allfile = []
        Attlink = "D:\\My Documents\\CheckQP\\QP"
        for name in os.listdir(Attlink):  # get the folder's file list
            if name != "":
                filelink = os.path.join(Attlink, name)  # build the full file path
                allfile.append(filelink)
        return allfile  # return the list of file paths in the folder
    
    def get_process_file():  # Collect the HTML files to be searched
        allfile = []
        Attlink = "D:\\My Documents\\CheckQP\\QP"
        for name in os.listdir(Attlink):  # get the folder's file list
            if name.split(".")[-1] == "html":
                filelink = os.path.join(Attlink, name)  # build the absolute path including the file name
                allfile.append(filelink)
        return allfile
    
    def find_files(root_dir, word):  # note: root_dir is unused; the folder is hardcoded in get_process_file()
        global output_content
        output_content = ""
        process_list = get_process_file()
        for file in process_list:
            search_file(file, word)
        return output_content
    
    root=Tk()
    root.title('Priscilla')
    ft=tf.Font(family="Segoe UI",size=12)
    frame1=Frame(root)
    frame2=Frame(root)
    frame3=Frame(root)
    frame1.grid(row=1,column=0)
    frame2.grid(row=2,column=0)
    frame3.grid(row=3,column=0)
    Label(frame1,text="Welcome to Priscilla, a tool for searching design guideline content").grid(row=0,column=0,padx=5,pady=5)
    Label(frame2, text="Enter a key word:  ").grid(row=3,column=0,padx=5,pady=5,sticky=W)
    v1=StringVar()
    e1=Entry(frame2,textvariable=v1)
    e1.grid(row=3,column=1,padx=5,pady=5,sticky=E)
    
    def show():
        root_dir="D:\\My Documents\\CheckQP\\QP\\"
        word=e1.get()
        find_files(root_dir,word)
        text.delete('1.0', END)  # clear any previous results before showing new ones
        text.insert(INSERT,output_content)
    
    Button(frame2,text="Search",width=10,command=show).grid(row=3,column=2,padx=10,pady=5,sticky=E)
    
    scrollbar = Scrollbar(frame3)
    scrollbar.pack(side=RIGHT, fill=Y)
    text = Text(frame3,font=ft, yscrollcommand=scrollbar.set)
    text.pack(side=LEFT,fill=BOTH)
    scrollbar.config(command=text.yview)
    
    mainloop()
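
    If the functions above are kept in a module without the Tk window code, the search core can also be driven directly. A minimal sketch, assuming the same hardcoded folder of HTML files and a made-up keyword:

    # Hypothetical non-GUI usage; "calibration" is a made-up keyword.
    result = find_files("D:\\My Documents\\CheckQP\\QP\\", "calibration")
    print(result)  # one "Search Result" report per HTML file, with headings and matching lines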
    
    
