一. 需求
比较喜欢动漫，想通过爬虫程序从下面这个网站下载一些动漫壁纸：
https://desk.zol.com.cn/bizhi/8366_103760_2.html

二. 解决方案
代码:
import json
import os
import re
from concurrent.futures import ThreadPoolExecutor

import requests
def download(imgsrc, timeout=30, save_dir="img"):
    """Download one image and save it under ``save_dir``.

    The local file name is the last path segment of ``imgsrc``.

    Args:
        imgsrc: Absolute URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block a worker thread forever.
        save_dir: Directory the image is written to; created if missing.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
        OSError: If the target directory or file cannot be written.
    """
    name = imgsrc.split("/")[-1]
    print(f"准备开始下载{name}")

    # Fetch first and fail on HTTP errors, so an error page is never saved
    # to disk as if it were an image.
    resp_img = requests.get(imgsrc, timeout=timeout)
    resp_img.raise_for_status()

    # exist_ok=True keeps this safe when several workers race to create it.
    os.makedirs(save_dir, exist_ok=True)

    # Image data is binary: write the raw bytes (resp.content), not resp.text.
    with open(os.path.join(save_dir, name), mode="wb") as f:
        f.write(resp_img.content)
    print(f"{name}下载完毕")
def main():
    """Scrape the wallpaper gallery page and download every listed image.

    The page source contains a JavaScript assignment of the form
    ``var deskPicArr = {...};``. The right-hand side is extracted with a
    regular expression, parsed as JSON, and each entry of its ``list`` is
    turned into a full-size image URL that is handed to ``download`` on a
    pool of three worker threads.

    Raises:
        requests.RequestException: If the gallery page cannot be fetched.
        RuntimeError: If the ``deskPicArr`` assignment is not in the page.
        json.JSONDecodeError: If the extracted text is not valid JSON.
    """
    url = "https://desk.zol.com.cn/bizhi/8366_103760_2.html"
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    obj = re.compile(r"var deskPicArr.*?=(?P<deskPicArr>.*?);", re.S)
    result = obj.search(resp.text)
    if result is None:
        # Without this guard the failure surfaces as an opaque
        # AttributeError on None when the page layout changes.
        raise RuntimeError(f"'var deskPicArr' not found in {url}")

    # A regex capture is always a string; parse the dict-like string into
    # a real dict.
    deskPicStr = result.group("deskPicArr")
    deskPic = json.loads(deskPicStr)

    with ThreadPoolExecutor(3) as t:
        futures = {}
        for item in deskPic['list']:
            oriSize = item.get("oriSize")
            imgsrc = item.get("imgsrc")
            if not imgsrc or not oriSize:
                # Incomplete entry: no usable URL can be built from it.
                continue
            # The URL carries a "##SIZE##" placeholder; fill in the
            # original size to request the full-resolution image.
            imgsrc = imgsrc.replace("##SIZE##", oriSize)
            futures[t.submit(download, imgsrc)] = imgsrc

        # Exceptions raised inside worker threads are stored on the future
        # and would otherwise vanish silently; report each failed download.
        for future, imgsrc in futures.items():
            error = future.exception()
            if error is not None:
                print(f"download failed for {imgsrc}: {error}")
    print("all over!")
# Start the scraper only when this file is executed as a script, so that
# importing it has no side effects.
if __name__ == "__main__":
    main()
测试记录:

网友评论