# -*- coding: utf-8 -*-
'''
Fetch Baidu Baike entries.
'''
###############################################################################
'''
Baidu Baike entry crawler
1. Get the initial category links from https://baike.baidu.com and from each
   initial category page.
2. Crawl the links of all entries under each category link; the data to collect
   is (entry name, link).
3. Store the collected data in MySQL: an (entry name, link) table.
   Use a JSON file to record progress: the category links not yet downloaded
   and the category links already downloaded.
4. Use multiple threads.
'''
###############################################################################
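# Step 3 above mentions MySQL, but the script below persists entries to a plain
# text file (entries.txt). The function that follows is a minimal sketch of the
# MySQL variant, assuming a pymysql connection and a `baike_entries(name, url)`
# table; the connection parameters and table name are illustrative only and are
# not part of the original script.
def save_entries_to_mysql(entries):
    '''Insert (entry name, link) pairs into an assumed baike_entries table.'''
    import pymysql  # optional dependency, only needed for this sketch
    conn = pymysql.connect(host='localhost', user='root', password='',
                           database='baike', charset='utf8mb4')
    try:
        with conn.cursor() as cur:
            cur.executemany(
                'INSERT INTO baike_entries (name, url) VALUES (%s, %s)',
                list(entries))
        conn.commit()
    finally:
        conn.close()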
import urllib.request as request
import urllib.parse
from lxml import etree
import threading
import json
import time
# Page downloader.
def urlopen(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
        'Referer': 'https://baike.baidu.com/',
        'Connection': 'keep-alive',
        'Content-Type': 'text/html',
    }
    req = request.Request(url, headers=headers)
    page = request.urlopen(req, timeout=20).read()
    page = page.decode('utf-8')
    return page
# Top-level category links on the home page.
def main_links():
    url = 'https://baike.baidu.com'
    page = urlopen(url)
    parse = etree.HTML(page, etree.HTMLParser())
    Mlinks = parse.xpath('//*[@id="commonCategories"]/dl/dd/div[1]/a')
    Mlinks = [(i.text, i.xpath('@href')[0]) for i in Mlinks]
    return Mlinks
# Collect all entry links under one category.
def one_link(url):
    index = 0        # page counter
    switch = 1       # loop switch
    one_links = []   # collected entries
    root = url       # category root URL
    while switch:
        index += 1
        url = root + '?limit=30&index=' + str(index) + '&offset=30#gotoList'
        #print(root, ' ', index)
        try:
            page = urlopen(url)
        except Exception:
            print(url, 'error...')
            continue
        parse = etree.HTML(page, etree.HTMLParser())
        urls = parse.xpath('//*[@id="content-main"]/div[3]/div[2]/div[1]/div[3]/div[1]/ul/li/div[2]/a')
        if urls:
            urls = [(ele.text, ele.xpath("@href")[0]) for ele in urls]
            one_links += urls
            #print(one_links)
        else:
            switch = 0
    # Links to related categories found on the page.
    l1 = parse.xpath('//*[@id="content-main"]/div[3]/div[2]/div[1]/div[2]/div/a')
    l2 = parse.xpath('//*[@id="content-main"]/div[3]/div[2]/div[1]/div[2]/div/div/a')
    l = [(i.text, i.xpath("@href")[0]) for i in l1]
    l += [(i.text, i.xpath("@href")[0]) for i in l2]
    print("finished downloading:", root)
    return (one_links, l)
threadLock = threading.Lock()

class Thread_One(threading.Thread):
    '''Worker thread: takes category links from the spider's undo list, crawls
    them with one_link(), and merges the results back under a shared lock.'''
    def __init__(self, spider):
        threading.Thread.__init__(self)
        self.spider = spider

    def run(self):
        spider = self.spider
        print('worker started')
        while spider.switch:
            # Acquire the lock before touching the shared link lists.
            threadLock.acquire()
            try:
                if not spider.undo_links:
                    break
                item = spider.undo_links.pop(0)
                spider.done_links.append(item)
                print('processing:', item[0])
                self.url = 'https://baike.baidu.com' + item[1]
                spider.now += 1
                if spider.now > spider.max:
                    spider.save_schedule()  # persist progress to disk
                    spider.now = 0
                    if spider.times > 10000:  # stop after 10000 rounds
                        print('round', spider.times, 'of downloads finished')
                        break
                    spider.times += 1
                    for i in range(120):  # pause between rounds
                        time.sleep(1)
                        print(120 - i, 's until round', spider.times, 'starts')
            finally:
                # Release the lock.
                threadLock.release()
            links, MLinks = one_link(self.url)
            # Acquire the lock again to merge the results.
            threadLock.acquire()
            try:
                # Cache the entry links.
                spider.entries += links
                # Cache category links that are neither pending nor done.
                for i in MLinks:
                    if i not in spider.undo_links and i not in spider.done_links:
                        spider.undo_links.append(i)
            finally:
                threadLock.release()
class baike_spider():
    def __init__(self, thread=1):
        self.undo_links = []        # category links not yet crawled
        self.done_links = []        # category links already crawled
        self.entries = []           # entry (name, link) pairs
        self.threadings = []        # worker thread list
        self.threadings_N = thread  # number of worker threads
        self.switch = 1             # thread on/off switch
        self.max = 30               # links to process before saving progress
        self.now = 0                # links processed since the last save
        self.times = 3              # round counter; workers pause two minutes between rounds

    def load_history(self):
        # Resume from history.json if it exists; otherwise keep the current lists.
        try:
            with open('history.json') as f:
                self.undo_links, self.done_links = json.loads(f.read())
        except FileNotFoundError:
            return
        # Keep only relative links (those starting with '/').
        self.undo_links = [i for i in self.undo_links if i[1][0] == '/']

    def save_schedule(self):
        self.undo_links = [i for i in self.undo_links if i[1][0] == '/']
        with open('history.json', 'w+') as f:
            # Deduplicate by joining each (name, link) pair with a '<s>' separator.
            undo = list(set([str(i[0]) + '<s>' + i[1] for i in self.undo_links]))
            done = list(set([str(i[0]) + '<s>' + i[1] for i in self.done_links]))
            self.undo_links = [i.split('<s>') for i in undo]
            self.done_links = [i.split('<s>') for i in done]
            f.write(json.dumps([self.undo_links, self.done_links], indent=2))
        with open('entries.txt', 'a+') as f:
            for i in self.entries:
                f.write(i[0] + ' ' + i[1] + '\n')
        self.entries = []  # clear the cached entries

    def run(self):
        for i in range(self.threadings_N):
            t = Thread_One(self)
            self.threadings.append(t)
        for t in self.threadings:
            t.start()
###############################################################################
# Seed the spider with the home-page categories (percent-encode non-ASCII hrefs),
# then resume from history.json if it exists.
MLinks = [(i[0], urllib.parse.quote(i[1])) for i in main_links()]
main = baike_spider(10)
main.undo_links = MLinks
main.load_history()
main.run()
'''
# Optional driver loop: restart the workers up to t times.
t = 10
c = 1
while 1:
    if not main.switch:
        #main.save_schedule()
        print('download round', c, 'finished, pausing now')
        for i in range(5):
            time.sleep(1)
            print(i)
        if c >= t:
            break
        else:
            c += 1
            main.run()
            main.switch = 1
    else:
        time.sleep(10)
'''