2019-06-17 Multithreaded Crawler

Author: hcc_9bf4 | Published 2019-06-18 21:13

    Analysis:
    Two kinds of threads: downloaders (3) and parsers (3)
    Content queue: downloader threads put data in; parser threads get data out (see the minimal sketch after this list)
    URL queue: downloader threads get page numbers from it
    Writing to the file: guarded by a lock
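
    The hand-off between the two groups of threads is the classic producer-consumer pattern built on queue.Queue, whose put and get are thread-safe. A minimal sketch of the pattern (the names here are illustrative, not taken from the case code):

    import threading
    from queue import Queue

    q = Queue()

    def producer():
        for i in range(5):
            q.put(i)        # downloader side: put work into the queue
        q.put(None)         # sentinel: tell the consumer to stop

    def consumer():
        while True:
            item = q.get()  # parser side: blocks until data is available
            if item is None:
                break
            print('handled', item)

    t1 = threading.Thread(target=producer)
    t2 = threading.Thread(target=consumer)
    t1.start()
    t2.start()
    t1.join()
    t2.join()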
    Case code:

    import threading
    import time
    from queue import Queue
    from lxml import etree
    import json
    import requests

    # list holding the crawl (download) threads
    g_crawl_list = []

    # list holding the parse threads
    g_parse_list = []
    
    class CrawlThread(threading.Thread):
        """Downloads pages and puts the raw HTML into data_queue."""
        def __init__(self, name, page_queue, data_queue):
            super(CrawlThread, self).__init__()
            self.name = name
            self.page_queue = page_queue
            self.data_queue = data_queue
            self.url = 'http://www.fanjian.net/jiantu-{}'
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
            }

        def run(self):
            print('%s --------- thread started' % self.name)
            while True:
                # decide when the crawl thread should exit:
                # stop once there are no page numbers left
                if self.page_queue.empty():
                    break
                # take a page number from the queue
                page = self.page_queue.get()

                # build the url and send the request
                url = self.url.format(page)

                r = requests.get(url, headers=self.headers)
                # put the response body into data_queue
                self.data_queue.put(r.text)
            print('%s --------- thread finished' % self.name)

    class ParseThread(threading.Thread):
        """Takes raw HTML from data_queue, parses it, and writes the result."""
        def __init__(self, name, data_queue, fp, lock):
            super(ParseThread, self).__init__()
            self.name = name
            self.data_queue = data_queue
            self.fp = fp
            self.lock = lock

        def run(self):
            print('%s --------- thread started' % self.name)
            while True:
                # if self.data_queue.empty():
                #     break
                # take one page of data from data_queue
                # (blocks for at most 10 seconds, then raises queue.Empty --
                # see the output below)
                data = self.data_queue.get(True, 10)
                # parse the content
                self.page_content(data)
            print('%s --------- thread finished' % self.name)

        def page_content(self, data):
            tree = etree.HTML(data)
            # find all the li elements first, then the title and url inside each
            li_list = tree.xpath('//ul[@class="cont-list"]/li')

            items = []
            for oli in li_list:
                # the title
                title = oli.xpath('.//h2/a/text()')[0]
                # the image urls
                url_page = oli.xpath('.//div[@class="cont-list-main"]//img/@data-src')
                item = {
                    'title': title,
                    'link': url_page,
                }
                items.append(item)
            # write to the file, guarded by the lock
            self.lock.acquire()
            self.fp.write(json.dumps(items, ensure_ascii=False) + '\n')
            self.lock.release()
    
    def create_queue():
        # create the page-number queue
        page_queue = Queue()
        for page in range(1, 6):
            page_queue.put(page)
        # create the content queue
        data_queue = Queue()

        return page_queue, data_queue

    # create the crawl threads
    def create_crawl_thread(page_queue, data_queue):
        crawl_name = ['Crawl thread 1', 'Crawl thread 2', 'Crawl thread 3']
        for name in crawl_name:
            # create one crawl thread
            tcrawl = CrawlThread(name, page_queue, data_queue)
            # keep it in the list
            g_crawl_list.append(tcrawl)

    # create the parse threads
    def create_parse_thread(data_queue, fp, lock):
        parse_name = ['Parse thread 1', 'Parse thread 2', 'Parse thread 3']
        for name in parse_name:
            # create one parse thread
            tparse = ParseThread(name, data_queue, fp, lock)
            # keep it in the list
            g_parse_list.append(tparse)

    def main():
        # create the queues
        page_queue, data_queue = create_queue()
        # open the output file
        fp = open('jian.json', 'a', encoding='utf8')
        # create the lock
        lock = threading.Lock()
        # create the crawl threads
        create_crawl_thread(page_queue, data_queue)
        time.sleep(3)

        # create the parse threads
        create_parse_thread(data_queue, fp, lock)

        # start all crawl threads
        for tcrawl in g_crawl_list:
            tcrawl.start()
        # start all parse threads
        for tparse in g_parse_list:
            tparse.start()

        # main thread waits for the child threads to finish
        for tcrawl in g_crawl_list:
            tcrawl.join()
        for tparse in g_parse_list:
            tparse.join()
        # close the file here, after all writers are done
        fp.close()
        print('Main thread: all child threads finished')

    if __name__ == '__main__':
        main()
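
    As a side note, the acquire()/release() pair around the file write in page_content can equally be written with the lock as a context manager, which releases the lock even if the write raises; the behavior is otherwise unchanged:

    # drop-in replacement for the last three lines of page_content
    with self.lock:
        self.fp.write(json.dumps(items, ensure_ascii=False) + '\n')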
    

    Output:

    Crawl thread 1 --------- thread started
    Crawl thread 2 --------- thread started
    Crawl thread 3 --------- thread started
    Parse thread 1 --------- thread started
    Parse thread 2 --------- thread started
    Parse thread 3 --------- thread started
    Crawl thread 2 --------- thread finished
    Crawl thread 1 --------- thread finished
    Crawl thread 3 --------- thread finished
    Exception in thread Parse thread 3:
    Traceback (most recent call last):
      File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\threading.py", line 917, in _bootstrap_inner
        self.run()
      File "E:\Sublime Text 3\day1\2_xincheng.py", line 56, in run
        data = self.data_queue.get(True, 10)
      File "C:\Users\Administrator\AppData\Local\Programs\Python\Python37\lib\queue.py", line 178, in get
        raise Empty
    _queue.Empty

    [Parse thread 1 and Parse thread 2 die with the same queue.Empty traceback.]

    Main thread: all child threads finished
    [Finished in 15.1s]
    

    The code that raises the exception still needs work: once the crawl threads finish and data_queue is drained, each parse thread blocks in data_queue.get(True, 10) until the 10-second timeout expires, queue.Empty is raised, and the thread dies with the traceback above.
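
    One way to let the parse threads exit cleanly, sketched here rather than taken from the author's final version, is to catch queue.Empty and treat the timeout as the end of the work. This replaces ParseThread.run (and needs one extra import at the top of the file):

    from queue import Empty

        def run(self):
            print('%s --------- thread started' % self.name)
            while True:
                try:
                    # block for at most 10 seconds waiting for a page
                    data = self.data_queue.get(True, 10)
                except Empty:
                    # nothing arrived within the timeout: the crawl threads
                    # are done and the queue is drained, so stop parsing
                    break
                self.page_content(data)
            print('%s --------- thread finished' % self.name)

    An alternative that avoids the fixed timeout is to have main() put one sentinel value (for example None) per parse thread into data_queue after the crawl threads are joined, and have run() break when it gets the sentinel.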
    [Screenshot: the resulting JSON data in jian.json]
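
    Each line of jian.json holds one JSON array (one page of results), so the file can be read back line by line; a minimal sketch, assuming the keys used in page_content above:

    import json

    with open('jian.json', encoding='utf8') as f:
        for line in f:
            # one JSON array of {'title': ..., 'link': [...]} per page
            for item in json.loads(line):
                print(item['title'], item['link'])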
