使用Pool实现多进程爬虫
重点就是用到pool.apply_async函数,示例如下:
from multiprocessing import Pool
import time
import os
from urllib.request import urlretrieve
# Target folder for the downloaded files: a "poem_test" directory located
# next to this script.
poem_dir = os.path.join(
    os.path.abspath(os.path.dirname(__file__)),
    'poem_test',
)
def download_mp3(p_name, p_url):
    """Download one MP3 file into ``poem_dir`` and print how long it took.

    Args:
        p_name: Base name of the target file; ``.mp3`` is appended to it.
        p_url: URL the audio is fetched from.

    Raises:
        OSError: If the target directory cannot be created, the URL cannot
            be fetched (``urllib.error.URLError`` is an ``OSError``), or the
            local file cannot be written.
    """
    print('开始下载{}...'.format(p_name))
    start = time.time()
    # urlretrieve() does not create missing parent directories, so make sure
    # the target folder exists. exist_ok=True keeps this safe when several
    # workers get here at the same time.
    os.makedirs(poem_dir, exist_ok=True)
    local_file = os.path.join(poem_dir, p_name + '.mp3')
    urlretrieve(url=p_url, filename=local_file)
    print('下载{}持续了{}秒'.format(p_name, time.time() - start))
if __name__ == '__main__':
    # Worker processes started with the "spawn" method re-import this module,
    # so creating the pool has to sit behind the __main__ guard; without it
    # multiprocessing raises RuntimeError on such platforms (e.g. Windows).
    submitted = []
    # Pool.__exit__ calls terminate(), so close() and join() are still called
    # explicitly inside the block to let the queued downloads finish. The
    # with-statement only guarantees cleanup if reading the link file fails.
    with Pool(4) as pool:  # 4 worker processes
        # Text file holding the download links; name and URL are the first
        # two '*'-separated fields of each line.
        with open('poem_20190803.txt') as f:
            for line in f:
                record = line.strip()  # also drops the newline after the URL
                if not record:
                    continue  # a blank line would otherwise raise IndexError
                fields = record.split('*')
                name, url = fields[0], fields[1]
                # apply_async() queues the call and returns immediately.
                submitted.append(
                    (name, pool.apply_async(download_mp3, args=(name, url))))
        pool.close()  # no more tasks; must be called before join()
        pool.join()  # wait until every queued download has finished

    # An exception raised in a worker is stored in the AsyncResult and only
    # surfaces when get() is called, so check every result explicitly.
    for name, result in submitted:
        try:
            result.get()
        except Exception as exc:
            print('下载{}失败: {}'.format(name, exc))
注意,在Windows系统中(以及其他默认使用spawn方式启动子进程的平台,例如Python 3.8及以上版本的macOS),上述pool = Pool(4)及后续代码,必须放在if __name__ == '__main__':下面,否则会报RuntimeError。
参考: https://stackoverflow.com/questions/18204782/runtimeerror-on-windows-trying-python-multiprocessing
使用ThreadPoolExecutor实现多线程爬虫
重点使用pool.submit函数,示例如下:
# 使用ThreadPoolExecutor线程池并发编程
import time
import os
from urllib.request import urlretrieve
from concurrent.futures import ThreadPoolExecutor
# Target folder for the downloaded files: a "poem_test" directory located
# next to this script.
poem_dir = os.path.join(
    os.path.abspath(os.path.dirname(__file__)),
    'poem_test',
)
def download_mp3(p_name, p_url):
    """Download one MP3 file into ``poem_dir`` and print how long it took.

    Args:
        p_name: Base name of the target file; ``.mp3`` is appended to it.
        p_url: URL the audio is fetched from.

    Raises:
        OSError: If the target directory cannot be created, the URL cannot
            be fetched (``urllib.error.URLError`` is an ``OSError``), or the
            local file cannot be written.
    """
    print('开始下载{}...'.format(p_name))
    start = time.time()
    # urlretrieve() does not create missing parent directories, so make sure
    # the target folder exists. exist_ok=True keeps this safe when several
    # worker threads get here at the same time.
    os.makedirs(poem_dir, exist_ok=True)
    local_file = os.path.join(poem_dir, p_name + '.mp3')
    urlretrieve(url=p_url, filename=local_file)
    print('下载{}持续了{}秒'.format(p_name, time.time() - start))
# max_workers is set explicitly. The original author notes that leaving it
# out -- ThreadPoolExecutor() -- and relying on the default "brings surprises".
submitted = []
# The with-statement calls shutdown(wait=True) on exit, so every queued
# download has finished by the time the block is left.
with ThreadPoolExecutor(max_workers=10) as pool:
    # Text file holding the download links; name and URL are the first two
    # '*'-separated fields of each line.
    with open('poem_20190803.txt') as f:
        for line in f:
            record = line.strip()  # also drops the newline after the URL
            if not record:
                continue  # a blank line would otherwise raise IndexError
            fields = record.split('*')
            name, url = fields[0], fields[1]
            # submit() takes the callable's arguments positionally: no tuple
            # and no args= keyword, unlike Pool.apply_async().
            submitted.append((name, pool.submit(download_mp3, name, url)))

# An exception raised inside a task is stored on its Future and stays
# invisible unless it is asked for, so report every failed download.
for name, future in submitted:
    error = future.exception()
    if error is not None:
        print('下载{}失败: {}'.format(name, error))
使用ProcessPoolExecutor实现多进程爬虫
重点使用pool.submit函数,示例如下:
# 使用ProcessPoolExecutor进程池并发编程
import time
import os
from urllib.request import urlretrieve
from concurrent.futures import ProcessPoolExecutor
# Target folder for the downloaded files: a "poem_test" directory located
# next to this script.
poem_dir = os.path.join(
    os.path.abspath(os.path.dirname(__file__)),
    'poem_test',
)
def download_mp3(p_name, p_url):
    """Download one MP3 file into ``poem_dir`` and print how long it took.

    Args:
        p_name: Base name of the target file; ``.mp3`` is appended to it.
        p_url: URL the audio is fetched from.

    Raises:
        OSError: If the target directory cannot be created, the URL cannot
            be fetched (``urllib.error.URLError`` is an ``OSError``), or the
            local file cannot be written.
    """
    print('开始下载{}...'.format(p_name))
    start = time.time()
    # urlretrieve() does not create missing parent directories, so make sure
    # the target folder exists. exist_ok=True keeps this safe when several
    # worker processes get here at the same time.
    os.makedirs(poem_dir, exist_ok=True)
    local_file = os.path.join(poem_dir, p_name + '.mp3')
    urlretrieve(url=p_url, filename=local_file)
    print('下载{}持续了{}秒'.format(p_name, time.time() - start))
if __name__ == '__main__':
    # ProcessPoolExecutor requires the __main__ module to be importable by
    # the worker processes, so the pool is only created behind this guard.
    #
    # max_workers is set explicitly. The original author reports that with
    # the default -- ProcessPoolExecutor() -- some downloads may not complete
    # (unlike ThreadPoolExecutor) and advises against relying on it.
    submitted = []
    # The with-statement calls shutdown(wait=True) on exit, so every queued
    # download has finished by the time the block is left.
    with ProcessPoolExecutor(max_workers=10) as pool:
        # Text file holding the download links; name and URL are the first
        # two '*'-separated fields of each line.
        with open('poem_20190803.txt') as f:
            for line in f:
                record = line.strip()  # also drops the newline after the URL
                if not record:
                    continue  # a blank line would otherwise raise IndexError
                fields = record.split('*')
                name, url = fields[0], fields[1]
                # submit() takes the callable's arguments positionally: no
                # tuple and no args= keyword.
                submitted.append((name, pool.submit(download_mp3, name, url)))

    # An exception raised inside a task is stored on its Future and stays
    # invisible unless it is asked for, so report every failed download.
    for name, future in submitted:
        error = future.exception()
        if error is not None:
            print('下载{}失败: {}'.format(name, error))
网友评论