# -*- coding: utf-8 -*-
'''
Fetch Baidu Baike entries.
'''
###############################################################################
'''
Baidu Baike entry crawler
1. Get the initial category links from https://baike.baidu.com and from each
   initial category page.
2. Crawl the links of all entries under each category link; the data to collect
   is (entry name, link).
3. Store the collected data in MySQL: an (entry name, link) table.
   Use a JSON file to record progress: the category links not yet downloaded
   and the category links already downloaded.
4. Use multiple threads.
'''
###############################################################################
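# Step 3 above mentions MySQL, but the script below persists entries to a plain
# text file (entries.txt). The function that follows is a minimal sketch of the
# MySQL variant, assuming a pymysql connection and a `baike_entries(name, url)`
# table; the connection parameters and table name are illustrative only and are
# not part of the original script.
def save_entries_to_mysql(entries):
    '''Insert (entry name, link) pairs into an assumed baike_entries table.'''
    import pymysql  # optional dependency, only needed for this sketch
    conn = pymysql.connect(host='localhost', user='root', password='',
                           database='baike', charset='utf8mb4')
    try:
        with conn.cursor() as cur:
            cur.executemany(
                'INSERT INTO baike_entries (name, url) VALUES (%s, %s)',
                list(entries))
        conn.commit()
    finally:
        conn.close()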
import urllib.request as request
import urllib.parse
from lxml import etree
import threading
import json
import time
# Page downloader.
def urlopen(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
        'Referer': 'https://baike.baidu.com/',
        'Connection': 'keep-alive',
        'Content-Type': 'text/html',
    }
    req = request.Request(url, headers=headers)
    page = request.urlopen(req, timeout=20).read()
    page = page.decode('utf-8')
    return page
# Top-level category links on the home page.
def main_links():
    url = 'https://baike.baidu.com'
    page = urlopen(url)
    parse = etree.HTML(page, etree.HTMLParser())
    Mlinks = parse.xpath('//*[@id="commonCategories"]/dl/dd/div[1]/a')
    Mlinks = [(i.text, i.xpath('@href')[0]) for i in Mlinks]
    return Mlinks
# Collect all entry links under one category.
def one_link(url):
    index = 0        # page counter
    switch = 1       # loop switch
    one_links = []   # collected entries
    root = url       # category root URL
    while switch:
        index += 1
        url = root + '?limit=30&index=' + str(index) + '&offset=30#gotoList'
        #print(root, ' ', index)
        try:
            page = urlopen(url)
        except Exception:
            print(url, 'error...')
            continue
        parse = etree.HTML(page, etree.HTMLParser())
        urls = parse.xpath('//*[@id="content-main"]/div[3]/div[2]/div[1]/div[3]/div[1]/ul/li/div[2]/a')
        if urls:
            urls = [(ele.text, ele.xpath("@href")[0]) for ele in urls]
            one_links += urls
            #print(one_links)
        else:
            switch = 0
    # Links to related categories found on the page.
    l1 = parse.xpath('//*[@id="content-main"]/div[3]/div[2]/div[1]/div[2]/div/a')
    l2 = parse.xpath('//*[@id="content-main"]/div[3]/div[2]/div[1]/div[2]/div/div/a')
    l = [(i.text, i.xpath("@href")[0]) for i in l1]
    l += [(i.text, i.xpath("@href")[0]) for i in l2]
    print("finished downloading:", root)
    return (one_links, l)
threadLock = threading.Lock()

class Thread_One(threading.Thread):
    '''Worker thread: takes category links from the spider's undo list, crawls
    them with one_link(), and merges the results back under a shared lock.'''
    def __init__(self, spider):
        threading.Thread.__init__(self)
        self.spider = spider

    def run(self):
        spider = self.spider
        print('worker started')
        while spider.switch:
            # Acquire the lock before touching the shared link lists.
            threadLock.acquire()
            try:
                if not spider.undo_links:
                    break
                item = spider.undo_links.pop(0)
                spider.done_links.append(item)
                print('processing:', item[0])
                self.url = 'https://baike.baidu.com' + item[1]
                spider.now += 1
                if spider.now > spider.max:
                    spider.save_schedule()  # persist progress to disk
                    spider.now = 0
                    if spider.times > 10000:  # stop after 10000 rounds
                        print('round', spider.times, 'of downloads finished')
                        break
                    spider.times += 1
                    for i in range(120):  # pause between rounds
                        time.sleep(1)
                        print(120 - i, 's until round', spider.times, 'starts')
            finally:
                # Release the lock.
                threadLock.release()
            links, MLinks = one_link(self.url)
            # Acquire the lock again to merge the results.
            threadLock.acquire()
            try:
                # Cache the entry links.
                spider.entries += links
                # Cache category links that are neither pending nor done.
                for i in MLinks:
                    if i not in spider.undo_links and i not in spider.done_links:
                        spider.undo_links.append(i)
            finally:
                threadLock.release()
class baike_spider():
    def __init__(self, thread=1):
        self.undo_links = []        # category links not yet crawled
        self.done_links = []        # category links already crawled
        self.entries = []           # entry (name, link) pairs
        self.threadings = []        # worker thread list
        self.threadings_N = thread  # number of worker threads
        self.switch = 1             # thread on/off switch
        self.max = 30               # links to process before saving progress
        self.now = 0                # links processed since the last save
        self.times = 3              # round counter; workers pause two minutes between rounds

    def load_history(self):
        # Resume from history.json if it exists; otherwise keep the current lists.
        try:
            with open('history.json') as f:
                self.undo_links, self.done_links = json.loads(f.read())
        except FileNotFoundError:
            return
        # Keep only relative links (those starting with '/').
        self.undo_links = [i for i in self.undo_links if i[1][0] == '/']

    def save_schedule(self):
        self.undo_links = [i for i in self.undo_links if i[1][0] == '/']
        with open('history.json', 'w+') as f:
            # Deduplicate by joining each (name, link) pair with a '<s>' separator.
            undo = list(set([str(i[0]) + '<s>' + i[1] for i in self.undo_links]))
            done = list(set([str(i[0]) + '<s>' + i[1] for i in self.done_links]))
            self.undo_links = [i.split('<s>') for i in undo]
            self.done_links = [i.split('<s>') for i in done]
            f.write(json.dumps([self.undo_links, self.done_links], indent=2))
        with open('entries.txt', 'a+') as f:
            for i in self.entries:
                f.write(i[0] + ' ' + i[1] + '\n')
        self.entries = []  # clear the cached entries

    def run(self):
        for i in range(self.threadings_N):
            t = Thread_One(self)
            self.threadings.append(t)
        for t in self.threadings:
            t.start()
###############################################################################
# Seed the spider with the home-page categories (percent-encode non-ASCII hrefs),
# then resume from history.json if it exists.
MLinks = [(i[0], urllib.parse.quote(i[1])) for i in main_links()]
main = baike_spider(10)
main.undo_links = MLinks
main.load_history()
main.run()
'''
# Optional driver loop: restart the workers up to t times.
t = 10
c = 1
while 1:
    if not main.switch:
        #main.save_schedule()
        print('download round', c, 'finished, pausing now')
        for i in range(5):
            time.sleep(1)
            print(i)
        if c >= t:
            break
        else:
            c += 1
            main.run()
            main.switch = 1
    else:
        time.sleep(10)
'''