Scraping Data with a Multithreaded Crawler

Author: Challis | Published 2018-11-29 19:58

Using Python's multithreading to crawl web pages quickly:

import requests
import queue
import json
import fake_useragent
import threading
from lxml import etree
import time

exit_flag = False
lock = threading.Lock()  # shared lock so concurrent file writes don't interleave

# Downloader thread: pulls URLs from page_queue and pushes responses onto data_queue
class DownSpider(threading.Thread):
    def __init__(self, page_queue, data_queue, id, *args, **kwargs):
        super().__init__(*args,**kwargs)
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.id = id
        self.ua = fake_useragent.UserAgent()
    
    # Override run(): fetch pages until the page queue is empty
    def run(self):
        while True:
            time.sleep(1)
            if self.page_queue.empty():  # crude stop condition; fine for a fixed set of pages
                break
            url = self.page_queue.get(block=False)
            response = requests.get(url, headers={'User-Agent': self.ua.random})
            self.data_queue.put(response, block=False)
            self.page_queue.task_done()  # mark the page finished only after its response is queued


# Parser thread: pulls responses from data_queue, extracts data and writes it to a file
class ParseSpider(threading.Thread):
    def __init__(self, data_queue, id, fp, *args, **kwargs):
        super().__init__(*args,**kwargs)
        self.data_queue = data_queue
        self.id = id
        self.fp = fp
  
    # Override run(): keep parsing until main() signals that all work is done
    def run(self):
        while True:
            time.sleep(1)
            if exit_flag:
                break
            try:
                response = self.data_queue.get(block=False)
                content = response.text
                tree = etree.HTML(content)
                results = {}  # extract the fields you need with tree.xpath(...); parsing omitted
                self.write_info(results)
                self.data_queue.task_done()
            except queue.Empty:
                print('Queue is empty, waiting...')
                time.sleep(1)
    
    # Write one record to the shared output file
    def write_info(self, data):
        with lock:  # serialize writes from the parser threads
            self.fp.write(json.dumps(data, ensure_ascii=False) + '\n')


def main():
    global exit_flag
    page_queue = queue.Queue(10)
    data_queue = queue.Queue(10)
    # Seed page_queue with the URLs to crawl
    base_url = 'https://www.qiushibaike.com/8hr/page/{}/'
    for i in range(1, 11):
        page_queue.put(base_url.format(i))

    for i in range(1, 4):
        DownSpider(page_queue, data_queue, i).start()

    fp = open('./example.txt', 'a', encoding='utf-8')
    for j in range(1, 4):
        ParseSpider(data_queue, j, fp).start()

    page_queue.join()
    data_queue.join()
    exit_flag = True
    fp.close()


if __name__ == "__main__":
    main()
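
One caveat about the shutdown above: exit_flag is a plain global that each parser polls between sleeps, so a parser can linger for one extra loop after main() flips it. A common alternative (a sketch, not part of the original post; SENTINEL and parser are illustrative names) is to enqueue one sentinel value per consumer thread so each thread shuts itself down:

import queue
import threading

SENTINEL = None  # marker object; any value the real data can never be works

def parser(data_queue):
    while True:
        item = data_queue.get()
        if item is SENTINEL:       # sentinel received: this thread is done
            data_queue.task_done()
            break
        # ... parse and write item here ...
        data_queue.task_done()

q = queue.Queue()
threads = [threading.Thread(target=parser, args=(q,)) for _ in range(3)]
for t in threads:
    t.start()
for item in ['page1', 'page2']:
    q.put(item)
for _ in threads:
    q.put(SENTINEL)                # one sentinel per consumer
for t in threads:
    t.join()                       # no shared flag needed

With sentinels there is no global flag to synchronize: each consumer exits deterministically once it has drained its share of the work.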

How queue.task_done() and queue.join() work together: every put() increments the queue's internal count of unfinished tasks, and every task_done() decrements it. queue.join() blocks the calling thread until that count falls back to zero, i.e. until a consumer has called task_done() once for every item that was ever put into the queue. Only then does the main thread continue, which is why main() above can safely flip exit_flag after both joins return.
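
A minimal, self-contained sketch of this counting behavior (independent of the crawler above; the names are illustrative):

import queue
import threading

q = queue.Queue()

def worker():
    while True:
        item = q.get()        # take one task off the queue
        print('processed', item)
        q.task_done()         # tell the queue this task is finished

threading.Thread(target=worker, daemon=True).start()

for n in range(5):
    q.put(n)                  # each put() raises the unfinished-task count

q.join()                      # blocks until task_done() has been called 5 times
print('all tasks done')

Because the worker is a daemon thread, the program can exit as soon as join() returns; without the join(), the main thread might finish before the worker has processed anything.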
