通过 XPath 获取每张图片的 URL 地址,免去了使用正则表达式的麻烦。
多玩图库:http://tu.duowan.com/tu
from lxml import etree # 需要安装lxml模块 pip install lxml
import requests
import os
# 定义一个类
class Tuku(object):
    """Downloader for the Duowan picture gallery (http://tu.duowan.com/tu).

    Each listing page is fetched, the image addresses are extracted with an
    XPath query, and every image is written into ``save_dir``.

    Attributes:
        headers: HTTP headers sent with listing-page requests.
        start_url: Listing URL template; ``{}`` receives the page offset.
        url: List of listing-page URLs that ``run`` walks through.
        save_dir: Directory the downloaded images are written to.
    """

    # Offset difference between two consecutive listing pages.
    PAGE_STEP = 30
    # Seconds to wait on the server before a request is abandoned, so a
    # stalled connection cannot hang the whole crawl.
    TIMEOUT = 10
    # Output folder used by the original script; kept as the default.
    DEFAULT_SAVE_DIR = r"C:\Users\yanji\Desktop\新建文件夹"
    # Characters that are not allowed in file names on common file systems.
    _UNSAFE_CHARS = '\\/:*?"<>|'

    def __init__(self, save_dir=None, pages=20):
        """Prepare the request headers and the list of listing-page URLs.

        Args:
            save_dir: Folder for the downloaded images. ``None`` selects
                ``DEFAULT_SAVE_DIR``.
            pages: Number of listing pages to crawl, starting at offset 0.
        """
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36",
        }
        self.start_url = "http://tu.duowan.com/tu?offset={}"
        self.save_dir = self.DEFAULT_SAVE_DIR if save_dir is None else save_dir
        # One URL per listing page, e.g. ...?offset=0, ...?offset=30, ...
        self.url = [
            self.start_url.format(page * self.PAGE_STEP) for page in range(pages)
        ]

    @staticmethod
    def _image_filename(image_url):
        """Return a file-system-safe file name derived from an image URL.

        The name is the last path segment of the URL with any query string
        or fragment removed. When the URL has no usable last segment, the
        final 10 characters of the URL are used instead.
        """
        path_part = image_url.split("#", 1)[0].split("?", 1)[0]
        name = path_part.rstrip("/").rsplit("/", 1)[-1]
        if not name:
            name = image_url[-10:]
        for char in Tuku._UNSAFE_CHARS:
            name = name.replace(char, "_")
        return name or "image"

    def _save_image(self, image_url):
        """Download one image into ``save_dir``; report and skip on failure."""
        try:
            response = requests.get(image_url, timeout=self.TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as err:
            # One broken image must not abort the rest of the page.
            print("skipped {}: {}".format(image_url, err))
            return
        target = os.path.join(self.save_dir, self._image_filename(image_url))
        # "wb", not "ab": appending on a re-run would corrupt an existing file.
        # The file is only opened once the bytes are in hand, so a failed
        # download never leaves an empty file behind.
        with open(target, "wb") as image_file:
            image_file.write(response.content)

    def get_xpath(self, url):
        """Download every image referenced on one listing page.

        Args:
            url: Address of the listing page to process.

        Raises:
            requests.RequestException: The listing page could not be fetched
                or the server answered with an error status.
        """
        page = requests.get(url, headers=self.headers, timeout=self.TIMEOUT)
        page.raise_for_status()
        if not page.content:
            # Nothing to parse; avoids handing an empty document to lxml.
            return
        tree = etree.HTML(page.content)
        if tree is None:
            return
        # The @src attributes of the thumbnails inside the picture list.
        image_urls = tree.xpath("//ul[@id='pic-list']/li/a/img/@src")
        if not image_urls:
            return
        os.makedirs(self.save_dir, exist_ok=True)
        for image_url in image_urls:
            self._save_image(image_url)

    def get_content(self):
        """Placeholder; intentionally does nothing."""
        pass

    def run(self):
        """Process every listing page in ``self.url`` in order."""
        for url in self.url:
            self.get_xpath(url)
# Script entry point: build the downloader and crawl every listing page.
if __name__ == "__main__":
    Tuku().run()
效果图.png
网友评论