multiprocessing is Python's built-in library for multi-process execution. For a multi-process crawler, the basic usage is as follows:
from multiprocessing import Pool
pool = Pool(processes=2)  # create a process pool with 2 worker processes
pool.map(func, iterable[, chunksize])  # run func in parallel over iterable; func is the crawler function, and in a crawler the iterable is typically a list of URLs
pool.close()  # stop accepting new tasks into the pool
pool.join()  # wait for all worker processes to finish
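As a concrete, minimal sketch of this pattern (the function fetch_status and the example URLs here are illustrative, not part of the original example):

from multiprocessing import Pool
import requests

def fetch_status(url):
    # runs in a worker process: fetch one URL and return its status code
    return url, requests.get(url).status_code

if __name__ == '__main__':
    urls = ['https://httpbin.org/get?page={}'.format(i) for i in range(1, 5)]
    with Pool(processes=2) as pool:
        # map() blocks until every URL has been processed, returning results in input order;
        # the with-block cleans up the pool afterwards
        for url, status in pool.map(fetch_status, urls):
            print(url, status)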
Taking the text section of Qiushibaike (a Chinese joke site) as an example, let's compare the efficiency of single-process and multi-process crawling:
import time
from lxml import etree
import requests
from multiprocessing import Pool
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
           '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
def re_scraper(url):
    r = requests.get(url, headers=headers)
    selector = etree.HTML(r.text)
    infos = selector.xpath('//div[@id="content-left"]/div')  # container div of each post; loop over these
    for info in infos:
        try:
            userId = info.xpath('div[1]/a[2]/h2/text()')[0]
        except IndexError:
            userId = 'Anonymous user'
        content = info.xpath('a/div/span/text()')[0]
        laugh = info.xpath('div[2]/span[1]/i/text()')[0] + ' laughs'
        comment = info.xpath('div[2]/span[2]/a/i/text()')[0] + ' comments'
        # the fields are parsed but not stored: this example only measures crawl time
if __name__ == '__main__':  # this guard is required on Windows, otherwise the worker processes fail to start
    urls = ['https://www.qiushibaike.com/text/page/{}/'.format(i)
            for i in range(1, 36)]
    start_1 = time.time()
    for url in urls:
        re_scraper(url)  # single process
    end_1 = time.time()
    print('Single-process crawl time:', end_1 - start_1)
    start_2 = time.time()
    pool = Pool(processes=2)  # 2 worker processes
    pool.map(re_scraper, urls)
    pool.close()
    pool.join()
    end_2 = time.time()
    print('2-process crawl time:', end_2 - start_2)
    start_3 = time.time()
    pool = Pool(processes=4)  # 4 worker processes
    pool.map(re_scraper, urls)
    pool.close()
    pool.join()
    end_3 = time.time()
    print('4-process crawl time:', end_3 - start_3)
(Screenshot of the timing output for the single-process, 2-process, and 4-process runs.)
The advantage of multiprocessing is clear when crawling large amounts of data; for small jobs, the overhead of starting processes makes it optional.
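Note that the example above discards the parsed fields. If the parent process needs the data back, Pool.map collects each worker's return value. Here is a minimal sketch under the assumption that the scraper is rewritten (as the hypothetical scrape_page) to return its parsed fields:

from multiprocessing import Pool
from lxml import etree
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
           '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

def scrape_page(url):
    # same parsing as re_scraper above, but the fields are returned instead of discarded
    r = requests.get(url, headers=headers)
    selector = etree.HTML(r.text)
    results = []
    for info in selector.xpath('//div[@id="content-left"]/div'):
        try:
            userId = info.xpath('div[1]/a[2]/h2/text()')[0]
        except IndexError:
            userId = 'Anonymous user'
        results.append({'userId': userId,
                        'content': info.xpath('a/div/span/text()')[0]})
    return results

if __name__ == '__main__':
    urls = ['https://www.qiushibaike.com/text/page/{}/'.format(i)
            for i in range(1, 36)]
    with Pool(processes=4) as pool:
        pages = pool.map(scrape_page, urls)  # one list of dicts per URL, in input order
    all_posts = [post for page in pages for post in page]
    print(len(all_posts), 'posts collected')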