Building a simple search engine in Python (with code)

Author: 48e0a32026ae | Published 2018-11-28 15:15

Today we will use Python to build a simple search engine.

A search engine is, at its core, three steps: preprocessing the data, tokenizing it to build an index, and querying that index.
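Before diving into the real pipeline, here is a toy sketch (not from the original post) of the inverted-index idea everything below is built around: map each term to the set of documents that contain it.

    docs = {1: 'python search engine', 2: 'simple search'}  # doc id -> tokenized text

    index = {}  # term -> set of ids of documents containing it
    for doc_id, text in docs.items():
        for term in text.split():
            index.setdefault(term, set()).add(doc_id)

    print(index['search'])  # {1, 2}: both documents contain 'search'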


(Here we assume all of the data is UTF-8.)

First, we collect all the URLs from a site:

    import urllib.request
    import urllib.parse
    import bs4

    def crawl(pages, depth=2):
        # Breadth-first crawl: each pass follows every link found on the current pages
        for i in range(depth):
            newpages = set()
            for page in pages:
                try:
                    c = urllib.request.urlopen(page)
                except Exception:
                    print('Invalid page:', page)
                    continue
                soup = bs4.BeautifulSoup(c.read(), 'html.parser')
                links = soup('a')  # all <a> tags on the page
                for link in links:
                    if 'href' in dict(link.attrs):
                        # Resolve relative links against the current page
                        url = urllib.parse.urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]  # drop fragment anchors
                        if url.startswith('http'):
                            newpages.add(url)
            pages = newpages

The loop grabs every link on the current pages so that we collect as many URLs as possible. We use a set rather than a list to avoid duplicates. The crawled URLs can then be stored in a file, in MySQL, or in MongoDB.
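For instance, a minimal sketch of persisting the crawled set to a text file (the name url.txt is an assumption here, chosen to match the file the indexing step reads later; pages stands for the URL set produced by the crawl):

    # Persist the crawled URLs, one per line (url.txt is what Fenci() reads later)
    with open('url.txt', 'w') as f:
        for url in sorted(pages):
            f.write(url + '\n')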

The downloaded pages are saved locally; we then redirect stdout to a file so that the path of every saved page ends up in lujing.txt:

    import sys

    # lujing here is the directory where the crawled pages were saved
    output = sys.stdout
    outputfile = open('lujing.txt', 'w')
    sys.stdout = outputfile              # everything printed from here on lands in lujing.txt
    filelist = GetFileList(lujing, [])
    sys.stdout = output                  # restore stdout before the processing below
    outputfile.close()
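GetFileList is never defined in the original post. A minimal sketch of what it presumably does (walk the directory tree and print each file path, which the redirected stdout captures into lujing.txt) could look like this:

    import os

    def GetFileList(dirpath, filelist):
        # Recursively print and collect every file path under dirpath
        for root, dirs, files in os.walk(dirpath):
            for name in files:
                path = os.path.join(root, name)
                print(path)
                filelist.append(path)
        return filelist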

Next we read the generated path file lujing.txt and process each file it lists, stripping the HTML tags:

    import re
    import chardet

    # Read lujing.txt and strip the HTML tags from every file it lists
    for line in open("lujing.txt"):
        print(line)
        # The slice offsets below depend on the author's local path layout
        line1 = line[0:12]
        line2 = line[13:16]
        line3 = line[17:-1]
        line4 = line[17:-6]   # file name with its extension dropped
        line = line1 + '\\' + line2 + '\\' + line3   # rebuild the Windows path
        print(line4)
        path = line
        fb = open(path, "rb")
        data = fb.read()
        # Detect the file's encoding and read the document with it
        bianma = chardet.detect(data)['encoding'] or 'utf-8'  # fall back to utf-8 if detection fails
        page = open(line, 'r', encoding=bianma, errors='ignore').read()
        dr = re.compile(r'<[^>]+>', re.S)  # regex matching any HTML tag
        dd = dr.sub('', page)              # drop the tags, keep the text
        print(dd)
        fname = 'TXT' + '\\' + line4 + '.txt'
        # Save the stripped text under TXT\, keeping the original file name
        f = open(fname, "w+", encoding=bianma)
        f.write(dd)
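As a quick sanity check of the tag-stripping regex:

    import re
    # Everything between '<' and '>' is deleted; only the text survives
    print(re.sub(r'<[^>]+>', '', '<p>hello <b>world</b></p>'))  # hello world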

Now for tokenizing and building the index.

Since most readers are more familiar with SQL, I have written this as the MySQL version; if you need the MongoDB version, message the official account.

    import jieba
    import chardet
    import pymysql

    # If you prefer MongoDB:
    # from pymongo import MongoClient
    # client = MongoClient('localhost', 27017)
    # apiDB = client['urlDB']
    # questionnaires = apiDB['weburl']
    # data = list(questionnaires.find())

    conn = pymysql.connect(host="localhost", user="root",
                           password="123456", db="suoyin", port=3307)
    c = conn.cursor()
    c.execute('drop table if exists doc')
    c.execute('create table doc (id int primary key, link text)')
    c.execute('drop table if exists word')
    c.execute('create table word (term varchar(25) primary key, list text)')
    conn.commit()

    def Fenci():
        num = 0
        for line in open("url.txt"):
            lujing = line.strip()
            print(lujing)
            num += 1
            # The slice offsets depend on the author's local path layout
            line = line[17:-5]
            line = 'TXT' + '\\' + line + '.txt'  # path of the tag-stripped text file
            print(line)
            path = line
            fb = open(path, "rb")
            data = fb.read()
            # Detect the file's encoding and decode with it
            bianma = chardet.detect(data)['encoding'] or 'utf-8'
            text = data.decode(bianma, errors='ignore')
            word = jieba.cut_for_search(text)  # jieba's search-engine segmentation mode
            seglist = list(word)
            print(seglist)
            cur = conn.cursor()
            cur.execute('insert into doc values(%s,%s)', (num, lujing))
            # Build the inverted index: term -> space-separated list of doc ids
            for word in seglist:
                # Check whether this term is already in the word table
                cur.execute('select list from word where term=%s', (word,))
                result = cur.fetchall()
                if len(result) == 0:
                    # New term: start a posting list with this doc id
                    docliststr = str(num)
                    cur.execute('insert into word values(%s,%s)', (word, docliststr))
                else:
                    # Known term: append this doc id to its posting list
                    docliststr = result[0][0]
                    docliststr += ' ' + str(num)
                    cur.execute('update word set list=%s where term=%s', (docliststr, word))
            conn.commit()
        conn.close()

    Fenci()
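Once Fenci() has run, each row of the word table holds a term and its posting list. A quick way to inspect one (the term 'python' is only an illustrative example):

    import pymysql

    conn = pymysql.connect(host="localhost", user="root",
                           password="123456", db="suoyin", port=3307)
    c = conn.cursor()
    c.execute('select list from word where term=%s', ('python',))
    print(c.fetchall())  # e.g. [('3 7 7',)]: once in doc 3, twice in doc 7
    conn.close()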

The last step is querying:

    import math
    import jieba
    import pymysql

    conn = pymysql.connect(host="localhost", user="root",
                           password="123456", db="suoyin", port=3307)
    c = conn.cursor()
    c.execute('select count(*) from doc')
    # Total document count; the +1 keeps idf positive even for a term
    # that appears in every document
    N = 1 + c.fetchall()[0][0]

    target = input('Enter search terms: ')
    seggen = jieba.cut_for_search(target)
    score = {}  # doc id -> relevance score

    for word in seggen:
        print('query word:', word)
        tf = {}  # doc id -> term frequency of this word
        c.execute('select list from word where term=%s', (word,))
        result = c.fetchall()
        if len(result) > 0:
            doclist = result[0][0]
            doclist = doclist.split(' ')
            # Convert the posting list into a list of int doc ids
            doclist = [int(x) for x in doclist]
            # Document frequency of the current word
            df = len(set(doclist))
            idf = math.log(N / df)
            print('idf:', idf)
            for num in doclist:
                if num in tf:
                    tf[num] = tf[num] + 1
                else:
                    tf[num] = 1
            # tf counted; now accumulate tf * idf into each document's score
            for num in tf:
                if num in score:
                    score[num] = score[num] + tf[num] * idf
                else:
                    score[num] = tf[num] * idf

    # Rank documents by score, highest first
    sortedlist = sorted(score.items(), key=lambda d: d[1], reverse=True)

    cnt = 0
    for num, docscore in sortedlist:
        cnt = cnt + 1
        c.execute('select link from doc where id=%s', (num,))
        url = c.fetchall()[0][0]
        print("Result Ranking:", cnt)
        print('url:', url, 'match degree:', docscore)
        if cnt > 20:
            break
    if cnt == 0:
        print('No result')
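To see the scoring on concrete numbers: suppose N = 11 and a query word's posting list is '3 7 7'. Then df = 2 and idf = ln(11/2) ≈ 1.70, so doc 7 (tf = 2) scores about 3.41 while doc 3 (tf = 1) scores about 1.70, and doc 7 ranks first.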

Done.
