本文不做过多的代码逻辑分析,用到的只是最基本的爬虫知识,主要是想试试线程池的实际效果。线程池的知识点请参考上一篇文章:https://www.jianshu.com/p/82f1a574ad8c
下面直接上代码。实践结果证明,对于网络请求、文件写入等存在等待延迟的IO操作,多线程能极大提高效率,而且使用concurrent.futures下的线程池模块实现起来非常容易。
以下代码可直接使用,爬取的图片用来自动更换壁纸很棒。注意更改下代码中的路径或者在当前目录新建“Bing每日一图”目录即可。
import os
import re
import time
from concurrent import futures

import requests
class BingImageDownload(object):
    """Download Bing "image of the day" pictures from bing.plmeizi.com.

    Every page number in ``pagelist`` is formatted into the snapshot-page
    URL. The page is scraped for the picture title and the full-size image
    link, and the image is stored as ``<save_dir>/<title>.jpg``.

    Args:
        pagelist: Iterable of page numbers to download.
        save_dir: Directory the images are written to (created on demand).
        timeout: Timeout in seconds applied to every HTTP request.
    """

    # Markup between the title span and the "view full image" link.
    _MESSAGE_RE = re.compile(r'<span id="title">(.*?)>>查看大图</a>', re.S)
    # The title is the text up to the first space of that markup.
    _TITLE_RE = re.compile(r'(.*?) ', re.S)
    # Link to the full-size picture.
    _IMAGE_URL_RE = re.compile(
        r'<a href="(.*?)" target="_blank" id="picurl">', re.S)

    def __init__(self, pagelist, save_dir='./Bing每日一图', timeout=10):
        self.url = "http://bing.plmeizi.com/show/{}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        }
        self.pagelist = pagelist
        self.save_dir = save_dir
        self.timeout = timeout

    def _fetch_image_info(self, page):
        """Scrape one snapshot page for its title and full-size image URL.

        Returns:
            ``{"title": ..., "image_url": ...}``, or ``None`` when the
            request fails, the status is not 200, or the expected markup
            is missing from the page.
        """
        try:
            response = requests.get(self.url.format(page),
                                    headers=self.headers,
                                    timeout=self.timeout)
        except requests.RequestException:
            return None
        if response.status_code != 200:
            return None

        message = self._MESSAGE_RE.search(response.text)
        if message is None:
            return None
        title = self._TITLE_RE.search(message.group(1))
        image_url = self._IMAGE_URL_RE.search(message.group(1))
        if title is None or image_url is None:
            return None
        return {"title": title.group(1), "image_url": image_url.group(1)}

    def get_image(self, page):
        """Download the picture shown on snapshot page ``page``.

        Args:
            page: Page number substituted into the snapshot-page URL.

        Returns:
            ``True`` when the image was fetched and saved, ``False``
            otherwise. Failures are reported on stdout instead of raised,
            so one bad page cannot abort a whole pool run.
        """
        image_info_dict = self._fetch_image_info(page)
        if image_info_dict is None:
            print('页面信息获取失败:第{}页'.format(page))
            succeeded = False
        else:
            succeeded = self.download_HDimage(image_info_dict, page)
        # Small delay after every page (success or failure) to reduce the
        # load on the server.
        time.sleep(0.2)
        return succeeded

    def to_file(self, content, filename, page):
        """Write the image bytes ``content`` to ``<save_dir>/<filename>.jpg``.

        Surrounding double quotes are stripped from ``filename``. The target
        directory is created if it does not exist yet.

        Raises:
            OSError: If the directory or the file cannot be written.
        """
        filename = filename.strip('"')
        # exist_ok=True keeps this safe when several workers get here at once.
        os.makedirs(self.save_dir, exist_ok=True)
        target = os.path.join(self.save_dir, '%s.jpg' % filename)
        with open(target, 'wb') as f:
            f.write(content)
        print("第%s页:【%s】-->下载完成" % (page, filename))

    def download_HDimage(self, image_info_dict, page):
        """Fetch the full-size image described by ``image_info_dict`` and save it.

        Args:
            image_info_dict: Mapping with the keys ``"title"`` and
                ``"image_url"``.
            page: Page number, used only for progress output.

        Returns:
            ``True`` when the image was downloaded and written to disk,
            ``False`` otherwise (a failure message is printed).
        """
        title = image_info_dict.get("title")
        image_url = image_info_dict.get("image_url")
        try:
            response = requests.get(image_url, headers=self.headers,
                                    timeout=self.timeout)
            saved = response.status_code == 200
            if saved:
                self.to_file(response.content, title, page)
        except (requests.RequestException, OSError):
            # Network error or unwritable file: report it as a failed download.
            saved = False
        if not saved:
            print("HD大图下载失败 -->title:{},url:{}".format(title, image_url))
        return saved

    def main(self, workers=20):
        """Download every page of ``self.pagelist`` through a thread pool.

        Args:
            workers: Maximum number of worker threads.

        Returns:
            The number of images that were downloaded successfully.
        """
        start = time.time()
        with futures.ThreadPoolExecutor(workers) as excutor:
            # Executor.map() yields a one-shot iterator, so it is turned
            # into a list exactly once; consuming it twice gives an empty
            # second result.
            results = list(excutor.map(self.get_image, self.pagelist))
        total_time = time.time() - start
        downloaded = sum(1 for succeeded in results if succeeded)
        print("---下载完成,总计下载数量:{},总耗时:{}---".format(downloaded, total_time))
        return downloaded
if __name__ == '__main__':
    # Script entry point: crawl snapshot pages 1 through 999.
    BingImageDownload(range(1, 1000)).main()
网友评论