Fast web page crawling with multithreading in Python
import requests
import queue
import json
import fake_useragent
import threading
from lxml import etree
import time

exit_flag = False
lock = threading.Lock()  # shared lock so parser threads do not interleave file writes
# Downloader spider: fetches pages and puts the responses onto data_queue
class DownSpider(threading.Thread):
    def __init__(self, page_queue, data_queue, id, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.id = id
        self.ua = fake_useragent.UserAgent()

    # Override the run method
    def run(self):
        while True:
            time.sleep(1)
            if self.page_queue.empty():
                break
            url = self.page_queue.get(block=False)
            response = requests.get(url, headers={'User-Agent': self.ua.random})
            self.data_queue.put(response)
            self.page_queue.task_done()
# Parser spider: takes responses off data_queue, parses them and writes the results
class ParseSpider(threading.Thread):
    def __init__(self, data_queue, id, fp, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.data_queue = data_queue
        self.id = id
        self.fp = fp

    # Override the run method
    def run(self):
        while True:
            time.sleep(1)
            if exit_flag:
                break
            try:
                response = self.data_queue.get(block=False)
                content = response.text
                tree = etree.HTML(content)
                # extract the fields you need with tree.xpath() and collect them in results; parsing omitted
                results = {}
                self.write_info(results)
                self.data_queue.task_done()
            except queue.Empty:
                print('Queue is empty, waiting...')
                time.sleep(1)

    # Write the parsed data to the file
    def write_info(self, data):
        with lock:  # lock the write so concurrent threads do not corrupt the output
            self.fp.write(json.dumps(data, ensure_ascii=False))
def main():
    global exit_flag
    page_queue = queue.Queue(10)
    data_queue = queue.Queue(10)
    # Fill page_queue with the page URLs
    base_url = 'https://www.qiushibaike.com/8hr/page/{}/'
    for i in range(1, 11):
        page_queue.put(base_url.format(i))
    # Start three downloader threads
    for i in range(1, 4):
        DownSpider(page_queue, data_queue, i).start()
    fp = open('./example.txt', 'a', encoding='utf-8')
    # Start three parser threads
    for j in range(1, 4):
        ParseSpider(data_queue, j, fp).start()
    page_queue.join()
    data_queue.join()
    exit_flag = True
    fp.close()

if __name__ == "__main__":
    main()
How queue.task_done() and queue.join() work together: every put() increments the queue's count of unfinished tasks, and every task_done() call (made by a consumer after it has finished processing the item it got) decrements it. queue.join() blocks the calling thread until that count drops to zero, i.e. until every item that was put into the queue has been fetched and marked done; only then does the main thread continue past the join.
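A minimal sketch of this behaviour, separate from the crawler above (the worker function and the item values are only illustrative):

import queue
import threading

q = queue.Queue()

def worker():
    while True:
        item = q.get()    # blocks until an item is available
        print('processing', item)
        q.task_done()     # decrement the unfinished-task counter

threading.Thread(target=worker, daemon=True).start()

for n in range(5):
    q.put(n)              # each put() increments the counter

q.join()                  # returns only after task_done() has been called five times
print('all items processed')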