import requests
import os
from lxml import etree
class spider(object):
    """Crawler for https://www.mzitu.com/.

    Reads the category titles and links from the home page, creates one
    folder per category, then walks each category's numbered pages and
    saves the main image of every page into that folder.
    """

    # Seconds to wait on a request before giving up. Without a timeout
    # ``requests`` may block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    # Characters that cannot appear in a file name on Windows (this set also
    # covers "/" on POSIX); each one is replaced by an underscore.
    _UNSAFE_CHARS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})

    def __init__(self):
        """Set the HTTP headers sent with every request."""
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
            "referer": "https://www.mzitu.com/"
        }

    def _get(self, url):
        """Send a GET request for *url* with the crawler's headers and timeout."""
        return requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)

    @classmethod
    def _safe_name(cls, title):
        """Return *title* with file-system-hostile characters replaced.

        Falls back to ``"untitled"`` when nothing usable is left. The
        operation is idempotent, so it is safe to apply more than once.
        """
        cleaned = title.translate(cls._UNSAFE_CHARS).strip()
        return cleaned or "untitled"

    @staticmethod
    def _parse_page_count(texts):
        """Return the page count held in ``texts[0]``, or 0 if unavailable.

        *texts* is the list produced by the pagination XPath query; it is
        empty when the element is missing and may hold non-numeric text.
        """
        if not texts:
            return 0
        try:
            return int(texts[0])
        except ValueError:
            return 0

    @staticmethod
    def _image_path(folder, index):
        """Return the path ``<folder>/<folder><index>.jpg`` for the running OS."""
        return os.path.join(folder, folder + str(index) + ".jpg")

    # 1. Fetch the site's home page.
    def requsts_dada(self):
        """Download every category listed on the home page.

        Creates a folder for each category title in the current working
        directory and delegates the downloads to ``download_img_data``.
        """
        response = self._get("https://www.mzitu.com/")
        html = etree.HTML(response.text)
        # 2. Collect the category titles and their links.
        class_tit = html.xpath('//ul[@id="pins"]/li/span/a/text()')
        class_href = html.xpath('//ul[@id="pins"]/li/span/a/@href')
        # Create one folder per category, then download into it.
        for tit, src in zip(class_tit, class_href):
            folder = self._safe_name(tit)
            # exist_ok avoids the check-then-create race of exists() + mkdir().
            os.makedirs(folder, exist_ok=True)
            self.download_img_data(src, folder)

    def download_img_data(self, src, tit):
        """Download the images of one category into the folder *tit*.

        Args:
            src: URL of the category's first page; page ``i`` is requested
                as ``src/i``.
            tit: Category title, used as the folder name and as the prefix
                of every saved file (``<tit>/<tit><i>.jpg``).

        A category whose page count cannot be read is skipped instead of
        aborting the whole crawl.
        """
        folder = self._safe_name(tit)
        os.makedirs(folder, exist_ok=True)

        # 3. Fetch the category page to learn how many pages it has.
        response = self._get(src)
        html = etree.HTML(response.text)
        img_num = html.xpath('//div[@class="pagenavi"]/a[5]/span/text()')
        page_total = self._parse_page_count(img_num)

        # Strip a trailing slash so the page URLs never contain "//".
        base_url = src.rstrip("/")
        for i in range(1, page_total + 1):
            # 4. Fetch page i and locate its main image link.
            img_data = self._get(base_url + "/" + str(i))
            page = etree.HTML(img_data.text)
            img_href = page.xpath('//div[@class ="main-image"]/p/a/img/@src')
            if not img_href:
                continue
            # The file name depends only on the page index, so exactly one
            # image (the first match) is stored per page.
            jpg_name = self._image_path(folder, i)
            content = self._get(img_href[0]).content
            print("正在下载图片……")
            # 5. Save the image.
            with open(jpg_name, "wb") as f:
                f.write(content)
if __name__ == "__main__":
    # Start the crawl only when this file is executed as a script, and bind
    # the instance to its own name so the class name `spider` is not
    # overwritten by the instance.
    crawler = spider()
    crawler.requsts_dada()
# "Reader comments" - page-footer text captured along with the snippet; not part of the program.