Python Crawler: A Small Multi-threaded Crawler Demo

Author: Pickupthesmokes | Published 2018-12-30 00:15
      import queue
      import requests
      import threading
      from lxml import etree
      import json
    
      # # maxsize: the maximum number of items the queue can hold
      # dataqueue = queue.Queue(maxsize=40)
      #
      # for i in range(0,50):
      #     if not dataqueue.full():
      #         dataqueue.put(i)
      #
      # # check whether the queue is empty
      # isempty = dataqueue.empty()
      # print(isempty)
      #
      # # check whether the queue is full
      # isfull = dataqueue.full()
      # print(isfull)
      #
      # # return the current size of the queue
      # size = dataqueue.qsize()
      # print(size)
      #
      # # FIFO (first in, first out)
      # print(dataqueue.get())
    
      # Note: a queue is the usual way to exchange data between threads, because queue.Queue is thread-safe
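      # A minimal, hypothetical producer/consumer sketch (not part of the original demo)
      # illustrating that put()/get() on a queue.Queue need no extra locking between threads:
      #
      # q = queue.Queue()
      #
      # def producer():
      #     for i in range(5):
      #         q.put(i)               # thread-safe put
      #
      # def consumer():
      #     for _ in range(5):
      #         print(q.get())         # thread-safe get, blocks until an item arrives
      #
      # t1 = threading.Thread(target=producer)
      # t2 = threading.Thread(target=consumer)
      # t1.start(); t2.start()
      # t1.join(); t2.join()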
      """
      1. Create a task queue holding the page URLs (page numbers) to be crawled
      2. Create crawler threads that perform the downloads
      3. Create a data queue holding the page source fetched by the crawler threads
      4. Create parser threads that parse the HTML source, extract the target data, and persist it
      """
      # Fetch the article list pages from jobbole
      #http://blog.jobbole.com/all-posts/page/1/
      #http://blog.jobbole.com/all-posts/page/2/
      #http://blog.jobbole.com/all-posts/page/3/
    
      def download_page_data(taskQueue,dataQueue):
          """
          Perform the download task
          :param taskQueue: queue to take page-number tasks from
          :param dataQueue: queue to put the fetched page source into
          :return:
          """
          while not taskQueue.empty():
              page = taskQueue.get()
              print('Downloading page ' + str(page), threading.current_thread().name)
              full_url = 'http://blog.jobbole.com/all-posts/page/%s/' % str(page)
              req_header = {
                  'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
              }
              response = requests.get(full_url,headers=req_header)
      
              if response.status_code == 200:
                  # store the fetched page source in the dataQueue queue
                  dataQueue.put(response.text)
              else:
                  # the request failed: put the page number back so another thread can retry it
                  taskQueue.put(page)
    
    
      def parse_data(dataQueue,lock):
          """
          Parse data: take page source out of dataQueue and extract the target data
          :param dataQueue: queue holding the downloaded page source
          :param lock: lock serializing writes to the output file
          :return:
          """
          while not dataQueue.empty():
              print('Parsing', threading.current_thread().name)
              html = dataQueue.get()
              html_element = etree.HTML(html)
              articles = html_element.xpath('//div[@class="post floated-thumb"]')
    
              for article in articles:
                  articleInfo = {}
                  # title
                  articleInfo['title'] = article.xpath('.//a[@class="archive-title"]/text()')[0]
                  # cover image
                  img_element = article.xpath('.//div[@class="post-thumb"]/a/img')
                  if len(img_element) > 0:
                      articleInfo['coverImage'] = img_element[0].xpath('./@src')[0]
                  else:
                      articleInfo['coverImage'] = 'no image available'
                  p_as = article.xpath('.//div[@class="post-meta"]/p[1]//a')
                  # tag/category (second link in the post meta)
                  articleInfo['tag'] = p_as[1].xpath('./text()')[0] if len(p_as) > 1 else ''
                  if len(p_as) > 2:
                      # comment count (third link, when present)
                      articleInfo['commentNum'] = p_as[2].xpath('./text()')[0]
                  else:
                      articleInfo['commentNum'] = '0'
                  # excerpt
                  articleInfo['content'] = article.xpath('.//span[@class="excerpt"]/p/text()')[0]
                  # publish time
                  articleInfo['publishTime'] = ''.join(article.xpath('.//div[@class="post-meta"]/p[1]/text()')).replace('\n','').replace(' ','').replace('\r','').replace('·','')
    
                  lock.acquire()  # acquire the lock so only one thread writes at a time
                  with open('jobbole.json','a+',encoding='utf-8') as file:
                      json_str = json.dumps(articleInfo,ensure_ascii=False) + '\n'
                      file.write(json_str)
                  lock.release()  # release the lock
    
      if __name__ == '__main__':
    
          # create the task queue
          taskQueue = queue.Queue()
    
          for i in range(1,201):
              taskQueue.put(i)
    
          # create the data queue
          dataQueue = queue.Queue()
    
          # create threads to perform the download tasks
          threadName = ['Download-Thread-1','Download-Thread-2','Download-Thread-3','Download-Thread-4']
          crawl_thread = []
          for name in threadName:
              # create the thread
              thread_crawl = threading.Thread(
                  target=download_page_data,
                  name=name,
                  args=(taskQueue,dataQueue)
              )
              crawl_thread.append(thread_crawl)
              # start the thread
              thread_crawl.start()
    
          # wait for all crawler threads to finish before continuing in the main thread
          for thread in crawl_thread:
              thread.join()
    
          # create a thread lock to protect the file writes
          lock = threading.Lock()
          # create parser threads that take page source out of the dataQueue queue and parse it
          threadName = ['Parse-Thread-1', 'Parse-Thread-2', 'Parse-Thread-3', 'Parse-Thread-4']
          parse_thread = []
          for name in threadName:
              # create the thread
              thread_parse = threading.Thread(
                  target=parse_data,
                  name=name,
                  args=(dataQueue,lock)
              )
              parse_thread.append(thread_parse)
              # start the thread
              thread_parse.start()
    
          for thread in parse_thread:
              thread.join()
    
          print('All done')
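
A note on the design: because the main thread joins all download threads before any parser thread is created, `while not dataQueue.empty()` is a safe loop condition in this demo. If you wanted parsing to overlap with downloading, one option is a sentinel-based pipeline built on `Queue.task_done()` and `Queue.join()`. The sketch below is only an illustration of that idea; `parser_worker` and `run_pipeline` are hypothetical helpers, not part of the original demo.

      # Hypothetical sketch: parser threads consume while downloads are still running;
      # one None sentinel per parser thread signals the end of the data stream.
      import queue
      import threading

      def parser_worker(dataQueue):
          while True:
              html = dataQueue.get()
              if html is None:           # sentinel: no more pages will arrive
                  dataQueue.task_done()
                  break
              # ... parse html and persist the results here ...
              dataQueue.task_done()

      def run_pipeline(download_threads, dataQueue, num_parsers=4):
          parsers = [threading.Thread(target=parser_worker, args=(dataQueue,))
                     for _ in range(num_parsers)]
          for p in parsers:
              p.start()
          for d in download_threads:     # wait for every (already started) download thread
              d.join()
          for _ in parsers:              # one sentinel per parser thread
              dataQueue.put(None)
          dataQueue.join()               # block until every queued item is processed
          for p in parsers:
              p.join()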
