Today we'll use Python to build a simple search engine.
At its core, a search engine does three things: preprocess the data, tokenize it to build an index, and answer queries against that index.
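To make these three stages concrete, here is a minimal, self-contained sketch of an inverted index; the documents and the query in it are made up purely for illustration:

from collections import defaultdict

# Two toy documents (invented for illustration only).
docs = {
    1: "python search engine tutorial",
    2: "build a web crawler in python",
}

# Indexing: map every term to the set of document ids that contain it.
index = defaultdict(set)
for doc_id, text in docs.items():
    for term in text.lower().split():   # trivial whitespace tokenizer
        index[term].add(doc_id)

# Query: intersect the posting sets of the query terms.
query = "python crawler"
hits = set(docs)
for term in query.lower().split():
    hits &= index.get(term, set())
print(hits)   # {2}

The real pipeline below follows the same shape, only with jieba for Chinese tokenization and MySQL for the index.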
(Here we assume all of the data is UTF-8 encoded.)
First, we collect all the URLs from a site:
import urllib.request
from urllib.parse import urljoin
import bs4

def crawl(pages, depth=2):
    # Breadth-first crawl: starting from the seed pages, follow links for `depth` rounds.
    for i in range(depth):
        newpages = set()
        for page in pages:
            try:
                c = urllib.request.urlopen(page)
            except Exception:
                print('Invalid page:', page)
                continue
            soup = bs4.BeautifulSoup(c.read(), 'html.parser')
            links = soup('a')
            for link in links:
                if 'href' in dict(link.attrs):
                    url = urljoin(page, link['href'])
                    if url.find("'") != -1:
                        continue
                    url = url.split('#')[0]        # drop the fragment part
                    if url.startswith('http'):     # keep only http/https links
                        newpages.add(url)
        pages = newpages
The loop above grabs every link on the current page so that we collect as many URLs as possible; a set is used instead of a list to avoid duplicates. The crawled URLs can then be saved to a file, to MySQL, or to MongoDB.
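The indexing step later in this post reads a file called url.txt, so the simplest option is to dump the crawled URLs there, one per line. A minimal sketch, assuming the crawl results are available in a set (the URLs shown are placeholders):

# Persist the crawled URLs to url.txt, one per line, so the indexing
# step below can read them back. all_urls stands in for whatever set of
# URLs the crawl produced; the example URLs are placeholders.
all_urls = {'http://example.com/a.html', 'http://example.com/b.html'}
with open('url.txt', 'w', encoding='utf-8') as f:
    for url in sorted(all_urls):
        f.write(url + '\n')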
import sys

# Redirect stdout to lujing.txt so the paths printed by GetFileList are
# captured in the path file ("lujing" means "path"), then restore stdout.
output = sys.stdout
outputfile = open('lujing.txt', 'w')
sys.stdout = outputfile
filelist = GetFileList(lujing, [])   # lujing: root directory of the saved pages
sys.stdout = output
outputfile.close()
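GetFileList is not shown in the original post. A plausible implementation simply walks the directory of saved pages and prints every file path, which the stdout redirect above then captures into lujing.txt; this is only a sketch under that assumption:

import os

# Hypothetical GetFileList: walk the root directory of saved HTML pages and
# print each file path (captured in lujing.txt by the stdout redirect above).
# The directory layout is an assumption; adjust it to your own setup.
def GetFileList(root, result):
    for dirpath, dirnames, filenames in os.walk(root):
        for name in filenames:
            path = os.path.join(dirpath, name)
            print(path)
            result.append(path)
    return result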
Now read the generated path file lujing.txt back in, process every file it lists, and strip the HTML tags:
for line in open("lujing.txt"):
print(line)
# line=line[0:-2]
line1 = line[0:12]
line2 = line[13:16]
line3 = line[17:-1]
line4 = line[17:-6]
line = line1 + '\' + line2 + '\' + line3
print(line4)
path = line
fb = open(path, "rb")
data = fb.read()
bianma = chardet.detect(data)['encoding'] # 获取当前文件的编码方式,并按照此编码类型处理文档
page = open(line, 'r', encoding=bianma, errors='ignore').read()
dr = re.compile(r'<[^>]+>', re.S) # 去HTML标签
dd = dr.sub('', page)
print(dd)
fname = 'TXT' + "\" + line4 + ".txt"
# print(fname)
f = open(fname, "w+", encoding=bianma) # 将去标签的文件写到文件夹内,并按照原命名以txt文档方式保存
# fo=open(fname,"w+")
f.write(dd)
Next, tokenize the text and build the index.
Since most readers are more comfortable with SQL, the code below uses MySQL; a rough MongoDB sketch is also included right after the indexing code.
import jieba
import chardet
import pymysql

# If you would rather use MongoDB:
# from pymongo import MongoClient
# client = MongoClient('localhost', 27017)
# apiDB = client['urlDB']          # serverDB_name: test_nodedata
# questionnaires = apiDB['weburl']
# data = list(questionnaires.find())

conn = pymysql.connect(host="localhost", user="root",
                       password="123456", db="suoyin", port=3307)
c = conn.cursor()
# Rebuild the two tables: doc maps a document id to its link,
# word maps each term to the list of document ids containing it.
c.execute('drop table if exists doc')
c.execute('create table doc (id int primary key, link text)')
c.execute('drop table if exists word')
c.execute('create table word (term varchar(25) primary key, list text)')
conn.commit()
def Fenci():
    # Tokenize every cleaned document and build the inverted index in MySQL.
    num = 0
    for line in open("url.txt"):
        lujing = line            # original link, stored in the doc table
        print(lujing)
        num += 1
        line = line[17:-5]       # slice out the bare file name (fixed layout of url.txt)
        line = 'TXT' + '\\' + line + '.txt'   # path of the cleaned text file
        print(line)              # file name
        path = line
        fb = open(path, "rb")
        data = fb.read()
        fb.close()
        bianma = chardet.detect(data)['encoding']   # detect the file's encoding
        text = data.decode(bianma or 'utf-8', errors='ignore')
        word = jieba.cut_for_search(text)    # search-style tokenization with jieba
        seglist = list(word)
        print(seglist)
        c = conn.cursor()        # cursor on the connection opened above
        c.execute('insert into doc values(%s,%s)', (num, lujing))
        # Build the posting list for every token of this document.
        for word in seglist:
            # Check whether the term already exists in the index.
            c.execute('select list from word where term=%s', (word,))
            result = c.fetchall()
            if len(result) == 0:
                # New term: start its posting list with the current doc id.
                docliststr = str(num)
                c.execute('insert into word values(%s,%s)', (word, docliststr))
            else:
                # Known term: append the current doc id to its posting list.
                docliststr = result[0][0]
                docliststr += ' ' + str(num)
                c.execute('update word set list=%s where term=%s', (docliststr, word))
    conn.commit()
    conn.close()

Fenci()
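For readers who would rather use MongoDB, here is a rough sketch of how the same doc and word tables could map onto two collections. This is not the code from this post; the database and collection names are assumptions, and it would replace the MySQL inserts inside Fenci():

from pymongo import MongoClient

# Rough MongoDB sketch of the same index (not the original code of this post).
# Database/collection names are assumptions; adjust them to your own setup.
client = MongoClient('localhost', 27017)
db = client['suoyin']

def index_doc_mongo(num, link, seglist):
    # doc collection: one entry per page, keyed by the document id.
    db.doc.insert_one({'_id': num, 'link': link})
    # word collection: one entry per term; $push keeps duplicates, so term
    # frequency can still be recovered at query time, like the MySQL posting list.
    for term in seglist:
        db.word.update_one({'_id': term},
                           {'$push': {'docs': num}},
                           upsert=True)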
The last step is the query:
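The ranking below is plain TF-IDF: for every query term w, each document d in its posting list gets tf(w, d) * log(N / df(w)) added to its score, where tf(w, d) is how many times d's id appears in w's posting list, df(w) is the number of distinct documents containing w, and N is the total number of indexed documents plus one, so the idf never drops to zero.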
import pymysql
import jieba
import math

conn = pymysql.connect(host="localhost", user="root",
                       password="123456", db="suoyin", port=3307)
c = conn.cursor()
c.execute('select count(*) from doc')
N = 1 + c.fetchall()[0][0]          # total number of documents, plus one
target = input('Enter search terms: ')
seggen = jieba.cut_for_search(target)
score = {}                          # doc id -> match score
for word in seggen:
    print('Query term:', word)
    # Compute this term's contribution to each document's score.
    tf = {}                         # doc id -> term frequency in that doc
    c.execute('select list from word where term=%s', (word,))
    result = c.fetchall()
    if len(result) > 0:
        doclist = result[0][0]
        doclist = doclist.split(' ')
        # Convert the posting list from a string into a list of ints.
        doclist = [int(x) for x in doclist]
        # df: number of distinct documents containing this term.
        df = len(set(doclist))
        idf = math.log(N / df)
        print('idf:', idf)
        for num in doclist:
            if num in tf:
                tf[num] = tf[num] + 1
            else:
                tf[num] = 1
        # tf counting done; now accumulate tf * idf into each document's score.
        for num in tf:
            if num in score:
                # The document already has a score, so add to it.
                score[num] = score[num] + tf[num] * idf
            else:
                score[num] = tf[num] * idf

sortedlist = sorted(score.items(), key=lambda d: d[1], reverse=True)
cnt = 0
for num, docscore in sortedlist:
    cnt = cnt + 1
    c.execute('select link from doc where id=%s', (num,))
    url = c.fetchall()[0][0]
    print("Result Ranking:", cnt)
    print('url:', url, 'match degree:', docscore)
    if cnt > 20:
        break
if cnt == 0:
    print('No result')
Done.