2019-10-23

Author: 半大人 | Published 2019-10-23 14:09
    # -*- coding: utf-8 -*-
    '''
    Baidu Baike entry scraper
    '''
    ###############################################################################
    '''
    Baidu Baike entry crawler
    1 Get the initial category links; they can be scraped from https://baike.baidu.com
      and from each initial category page itself.

    2 Crawl every category link and collect all of its entries as (entry name, link) pairs.

    3 Store the (entry name, link) data. The plan mentions MySQL, but this script writes
      a plain text file instead (a MySQL sketch is included after save_schedule below).
      A JSON file records progress: category links not yet downloaded, and those already downloaded.

    4 Use multiple threads.
    '''
    ###############################################################################
    import urllib.request as request
    import urllib.parse

    from lxml import etree
    import threading
    import json
    import time
    
    # Page downloader
    def urlopen(url):
        headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
        'Referer': 'https://baike.baidu.com/',
        'Connection': 'keep-alive',
        'Content-Type': 'text/html',
        
        }
        req = request.Request(url, headers=headers)
        page = request.urlopen(req,timeout=20).read()
        page = page.decode('utf-8')
        
        return page
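
    # Minimal usage sketch (network access assumed); urlopen() returns the page
    # as decoded HTML text:
    #
    #   html = urlopen('https://baike.baidu.com')
    #   print(html[:200])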
    
    
    # Top-level category links on the homepage
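    # Returns [(category name, relative href), ...]; the hrefs are relative, and
    # 'http://baike.baidu.com' is prepended later, just before they are fetched.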
    def main_links():
        
        url='https://baike.baidu.com'
        page=urlopen(url)
        parse=etree.HTML(page,etree.HTMLParser())
        Mlinks=parse.xpath('//*[@id="commonCategories"]/dl/dd/div[1]/a')
        Mlinks=[(i.text,i.xpath('@href')[0]) for i in Mlinks]
        return Mlinks
    
    # Collect all entry links under one category
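    # Pages through the category with ?limit=30&index=N until the entry list
    # comes back empty, then scrapes the sub-category links from that last page.
    # Returns (entry links, sub-category links).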
    def one_link(url):
        
        index=0       # page counter
        switch=1      # loop flag
        one_links=[]  # collected (entry name, link) pairs
        root=url      # base URL of this category
        while switch:
            index+=1
            url=root+'?limit=30&index='+str(index)+'&offset=30#gotoList'
            #print(root,'  ',index)
            try:
                page=urlopen(url)
            except Exception:
                print(url, 'download failed, skipping this page...')
                continue
            parse=etree.HTML(page,etree.HTMLParser())
            urls=parse.xpath('//*[@id="content-main"]/div[3]/div[2]/div[1]/div[3]/div[1]/ul/li/div[2]/a')
            if urls:
                urls=[(ele.text,ele.xpath("@href")[0]) for ele in urls]
                one_links += urls
                #print(one_links)
            else:
                switch=0
                l1=parse.xpath('//*[@id="content-main"]/div[3]/div[2]/div[1]/div[2]/div/a')
                l2=parse.xpath('//*[@id="content-main"]/div[3]/div[2]/div[1]/div[2]/div/div/a')
                l=[(i.text,i.xpath("@href")[0]) for i in l1]
                l+=[(i.text,i.xpath("@href")[0]) for i in l2]
        print("Finished downloading:", root)
        return (one_links,l)
    
    
    threadLock = threading.Lock()
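    # Worker thread. It receives the *names* of the shared attributes on the
    # module-level spider instance (e.g. 'main.undo_links') and reads/writes them
    # with eval()/exec() while holding threadLock, so every thread works on the
    # same queue of category links and the same result lists.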
    class Thread_One (threading.Thread):
        def __init__(self,undo_name='',done_name='',entries_name='',switch_name='',max_name='',now_name='',times_name=''):
            threading.Thread.__init__(self)
            self.undo_name=undo_name
            self.done_name=done_name
            self.entries_name=entries_name
            self.switch_name=switch_name
            self.max_name=max_name
            self.now_name=now_name
            self.times_name=times_name
        def run(self):
            print('thread started')
            while (eval(self.switch_name)):
                # acquire the lock
                threadLock.acquire()
                self.url=eval("'http://baike.baidu.com'+"+self.undo_name+'[0][1]')
                exec(self.done_name+'.append('+self.undo_name+'[0])')
                print('now processing:', eval(self.undo_name+'[0][0]'))
                exec("del "+self.undo_name+'[0]')
    
                exec(self.now_name+'+=1')
                if eval(self.now_name)>eval(self.max_name):
                    exec('main.save_schedule()')  # save the progress files
                    exec(self.now_name+'=0')
                    if eval(self.times_name)>10000:  # stop after 10000 rounds
                        print('finished round', eval(self.times_name), 'of downloading')
                        threadLock.release()  # release before exiting so the other threads are not deadlocked
                        break
                    else:
                        exec(self.switch_name+'=1')
                        exec(self.times_name+'+=1')
                        # the lock is still held here, so this pause stops every thread between rounds
                        for i in range(120):
                            time.sleep(1)
                            print(120-i, 's until round', eval(self.times_name), 'of downloading starts')
                    
                # release the lock
                threadLock.release()
                links,MLinks=one_link(self.url)
                # acquire the lock
                threadLock.acquire()
                # cache the entry links
                exec(self.entries_name+'+=links')
                # queue any sub-category links that are neither pending nor done
                undo=[i for i in MLinks if eval('i not in '+self.undo_name+' and i not in '+self.done_name)]
                for i in undo:
                    exec(self.undo_name+'.append(i)')
                # release the lock
                threadLock.release()
                
            
    class baike_spider():
        def __init__(self,thread=1):
            self.undo_links=[]    # category links not yet downloaded
            self.done_links=[]    # category links already downloaded
            self.entries=[]       # collected (entry name, link) pairs
            self.threadings=[]    # worker threads
            self.threadings_N=thread  # number of threads
            self.switch=1         # thread on/off switch
            self.max=30           # save progress after this many categories
            self.now=0            # categories processed since the last save
            self.times=3          # round counter; the threads pause 120 s between rounds
        
        def load_history(self):
            # Resume from a previous run; on the first run history.json does not exist yet.
            try:
                with open('history.json') as f:
                    self.undo_links,self.done_links=json.loads(f.read())
            except FileNotFoundError:
                return
            self.undo_links=[i for i in self.undo_links if i[1][0]=='/']
        def save_schedule(self):
            # Keep only relative links, deduplicate both lists, write them to
            # history.json, and append the cached entries to entries.txt.
            self.undo_links=[i for i in self.undo_links if i[1][0]=='/']
            with open('history.json','w+') as f:
                undo=list(set([str(i[0])+'<s>'+i[1] for i in self.undo_links]))
                done=list(set([str(i[0])+'<s>'+i[1] for i in self.done_links]))
                self.undo_links=[i.split('<s>') for i in undo]
                self.done_links=[i.split('<s>') for i in done]

                f.write(json.dumps([self.undo_links,self.done_links],indent=2))
            with open('entries.txt','a+') as f:
                for i in self.entries:
                    f.write(i[0]+' '+i[1]+'\n')
                self.entries=[]  # clear the entry cache
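
        # Step 3 of the plan above mentions a MySQL table for the (entry name, link)
        # pairs, but this script only writes entries.txt. A minimal sketch of that
        # storage step, assuming a local MySQL server and the pymysql package
        # (both are assumptions, not part of the original script):
        #
        #   def save_entries_mysql(self):
        #       import pymysql
        #       conn = pymysql.connect(host='localhost', user='root', password='',
        #                              db='baike', charset='utf8mb4')
        #       try:
        #           with conn.cursor() as cur:
        #               cur.execute('CREATE TABLE IF NOT EXISTS entries ('
        #                           'name VARCHAR(255), url VARCHAR(512))')
        #               cur.executemany('INSERT INTO entries (name, url) VALUES (%s, %s)',
        #                               self.entries)
        #           conn.commit()
        #       finally:
        #           conn.close()
        #       self.entries = []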
                    
        def run(self):
            for i in range(self.threadings_N):
                t=Thread_One(undo_name='main.undo_links',done_name='main.done_links',entries_name='main.entries',switch_name='main.switch',max_name='main.max',now_name='main.now',times_name='main.times')  # TODO: needs changing; the hard-coded names assume the instance is called `main`
                self.threadings.append(t)
            for i in range(len(self.threadings)):
                self.threadings[i].start()
        
    ###############################################################################        
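    # Bootstrap: seed the work queue with the homepage categories (percent-encoding
    # the hrefs), resume from history.json if it exists (which replaces the seeded
    # list), then start the worker threads.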
    
    MLinks=[(i[0],urllib.parse.quote(i[1])) for i in main_links()]
    main=baike_spider(10)
    main.undo_links=MLinks
    main.load_history()
    main.run()
    
    
    # Disabled driver loop kept from the original post: it would restart the
    # threads up to t times whenever main.switch drops to 0.
    '''
    t=10
    c=1
    while 1:
        if not main.switch:
            #main.save_schedule()
            print('download round', c, 'finished, pausing now')
            for i in range(5):
                time.sleep(1)
                print(i)
            if c>=t:
                break
            else:
                c+=1
            main.run()
            main.switch=1
        else:
            time.sleep(10)

    '''
    
            
            
        
    
    
    
