本文的多线程实现用到了concurrent.futures模块,该模块是Python中最广为使用的并发库,它可以非常方便地将任务并行化。concurrent.futures模块提供了两种并发模式,分别如下:
- 多线程模式:ThreadPoolExecutor,适合 IO密集型任务;
- 多进程模式:ProcessPoolExecutor,适合计算密集型任务。
关于该模块的详细介绍可以参考其官方文档:https://docs.python.org/3/library/concurrent.futures.html 。本次爬虫项目将会用到concurrent.futures模块中的ThreadPoolExecutor类。
下面是具体实现代码:
# -*- coding: UTF-8 -*-
import json
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
# Seconds to wait on each HTTP request before giving up, so that a stalled
# connection cannot block a worker thread indefinitely.
_REQUEST_TIMEOUT = 30


def _parse_translations(script):
    """Return the translation entries embedded in the page's inline script.

    Raises:
        ValueError: If the translation payload is not present in *script*.
    """
    match = re.search('{"translation":.*,"wordhints"', script)
    if match is None:
        raise ValueError("translation data not found in page script")
    start, end = match.span()
    # Strip the '{"translation":' prefix (15 characters) and the
    # ',"wordhints"' suffix (12 characters) and decode what is left in one
    # call. Decoding the whole value avoids splitting the raw text on "},",
    # which breaks as soon as a translated sentence contains that sequence.
    # NOTE(review): assumes the value is a JSON array of objects, each with
    # "timestamps" and "text" keys -- confirm against a live page.
    return json.loads(script[start + 15:end - 12])


def _extract_mp3_url(script):
    """Return the audio URL passed to ``initPlayPage`` in the inline script.

    Raises:
        ValueError: If no ``initPlayPage`` call is found in *script*.
    """
    # The parentheses are an unescaped regex group, so this matches the
    # text "initPlayPage" followed by the remainder of that line.
    match = re.search('initPlayPage(.*)', script)
    if match is None:
        raise ValueError("initPlayPage call not found in page script")
    start, end = match.span()
    # Drop the first 14 and the last 4 characters of the matched text.
    # NOTE(review): presumably these are 'initPlayPage("' and the closing
    # characters of the call; the fixed offsets are kept as-is because the
    # exact page format cannot be confirmed from here.
    return script[start + 14:end - 4]


def paragraph_info(id, order=0):
    """Scrape one article page and download its audio file.

    Args:
        id: Article identifier (string); used in the page URL and as the
            base name of the saved audio file.
        order: Value sent as the ``order`` query parameter.

    Returns:
        A dict with the keys ``id``, ``sentence`` (a list of dicts holding
        ``order``, ``timestamps``, ``english`` and ``chinese``) and
        ``save_path`` (where the audio was written).

    Raises:
        requests.RequestException: If a request fails, times out or
            returns an HTTP error status.
        ValueError: If the page lacks the expected article element,
            inline script or embedded data.
    """
    artic = {'id': id}
    url = "https://dict.eudic.net/webting/play?id={}&order={}".format(id, order)
    print(url)
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    article = soup.select_one("div #article")
    if article is None:
        raise ValueError("article element not found in page")
    # The first matching span is skipped, as in the original implementation.
    sentences = article.select("span.sentence")[1:]

    script_tag = soup.select_one("script[type='text/javascript']")
    if script_tag is None:
        raise ValueError("inline script not found in page")
    script = str(script_tag.get_text())

    translations = _parse_translations(script)
    artic['sentence'] = [
        {
            'order': index,
            'timestamps': translations[index]['timestamps'],
            'english': sen.get_text(),
            'chinese': translations[index]['text'],
        }
        for index, sen in enumerate(sentences)
    ]

    mp3_url = _extract_mp3_url(script)
    req = requests.get(mp3_url, timeout=_REQUEST_TIMEOUT)
    # Fail loudly instead of saving an HTTP error page as an .mp3 file.
    req.raise_for_status()
    # File path under the current working directory.
    save_path = os.path.abspath(os.curdir) + "/audios/" + id + ".mp3"
    # Create the target directory on first use; exist_ok keeps this safe
    # when several worker threads reach this line at the same time.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, 'wb') as audio_file:
        audio_file.write(req.content)
    artic['save_path'] = save_path
    print(artic)
    return artic
def main():
    """Download the sample articles concurrently and print the elapsed time.

    Each article id is handed to ``paragraph_info`` on a two-thread pool.
    A failed download is reported on stdout rather than aborting the run.
    """
    # Measure how long the whole crawl takes.
    print('*' * 50)
    start = time.time()
    ids = ["e27a4e2d-f7dc-442a-937e-6b10691275e5", "520ae730-20d2-11e6-bcc9-000c29ffef9b"]
    # max_workers is the number of threads and can be tuned. The ``with``
    # block shuts the pool down once the work is finished.
    with ThreadPoolExecutor(max_workers=2) as executor:
        # submit() takes the function first, then its arguments.
        future_tasks = [executor.submit(paragraph_info, article_id) for article_id in ids]
        # Block until every task has finished before continuing.
        wait(future_tasks, return_when=ALL_COMPLETED)
    # wait() does not re-raise worker exceptions, so surface them here;
    # otherwise a failed download would go completely unnoticed.
    for article_id, future in zip(ids, future_tasks):
        error = future.exception()
        if error is not None:
            print('文章 %s 抓取失败:%r' % (article_id, error))
    end = time.time()
    print('使用多线程,总共耗时:%s' % (end - start))
    print('*' * 50)


if __name__ == '__main__':
    main()
其中,获取当前工作路径可以通过以下方法:
import os

# Several equivalent ways to obtain the current working directory.
# print() is called as a function so the snippet runs on Python 3.
print(os.getcwd())  # current working directory
print(os.path.abspath('.'))  # current working directory
print(os.path.abspath('..'))  # parent of the current working directory
print(os.path.abspath(os.curdir))  # current working directory
参考:https://segmentfault.com/a/1190000015326189
https://blog.csdn.net/qq_15188017/article/details/53991216
网友评论