从开始学习 Python 到第一次开始动手写爬虫也算是小小的进步了,作为练习来爬几张图巩固一下 Requests 和 XPath 的常用方法。
就用 ZOL 桌面壁纸(http://desk.zol.com.cn)这个网站开练吧。
爬壁纸大致思路
先根据分类标签构造对应分类列表页的 url,再从列表页中获取各组图详情页的 url,最后解析详情页,找到大图资源并保存到本地。
爬虫代码
import requests
from lxml import etree
import os
def choice_class(tag):
    """Translate a menu number into the site's category path segment.

    Numbers 1-4 map to their dedicated categories; every other value
    falls back to 'qiche'.
    """
    known_categories = (
        (1, 'meinv'),
        (2, 'chemo'),
        (3, 'fengjing'),
        (4, 'dongwu'),
    )
    for number, slug in known_categories:
        if tag == number:
            return slug
    return 'qiche'
def get_html(url, path, timeout=10):
    """Download a page and return the nodes matched by an XPath expression.

    Args:
        url: Address of the page to download.
        path: XPath expression evaluated against the parsed document.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a single stalled connection would block the whole
            crawl indefinitely.

    Returns:
        Whatever ``selector.xpath(path)`` yields for the parsed page.
    """
    response = requests.get(url, timeout=timeout)
    # The response is decoded as gb2312 regardless of what requests guessed.
    response.encoding = 'gb2312'
    selector = etree.HTML(response.text)
    return selector.xpath(path)
def get_source(url, path_1, path_2):
    """Save the large image of every wallpaper set linked from a list page.

    Args:
        url: Address of a category list page.
        path_1: XPath selecting the anchor of each wallpaper set on the
            list page.
        path_2: XPath selecting the large image element on a detail page.

    Images are written as ``<title>.jpg`` into a ``图片`` directory under
    the current working directory; the directory is created when missing.
    Entries lacking a link, a title or an image source are skipped instead
    of aborting the crawl.
    """
    save_dir = os.path.join(os.getcwd(), '图片')
    # open() does not create parent directories, so make sure it exists.
    os.makedirs(save_dir, exist_ok=True)

    for info_1 in get_html(url, path_1):
        hrefs = info_1.xpath('@href')
        titles = info_1.xpath('span/@title')
        if not hrefs or not titles:
            continue
        pic_url = 'http://desk.zol.com.cn' + hrefs[0]
        # The title comes from the page and is used as a file name, so
        # path separators must not leak into the output path.
        title = titles[0].replace('/', '_').replace('\\', '_')

        for info_2 in get_html(pic_url, path_2):
            sources = info_2.xpath('@src')
            if not sources:
                continue
            response = requests.get(sources[0], timeout=10)
            if not response.ok:
                # Do not store an error page under a .jpg name.
                continue
            with open(os.path.join(save_dir, title + '.jpg'), 'wb') as f:
                f.write(response.content)
def spider(begin, end, tag_name):
    """Crawl the list pages of one category.

    Pages are visited from ``begin`` up to, but not including, ``end``.
    ``tag_name`` is the category path segment used in the list page url.
    """
    # XPath of each set's anchor on a list page, and of the large image
    # on a detail page; neither depends on the page number.
    list_anchor_xpath = '//ul[@class="pic-list2 clearfix"]/li/a'
    big_image_xpath = '//img[@id="bigImg"]'

    for page in range(begin, end):
        list_url = ''.join(
            ('http://desk.zol.com.cn/', tag_name, '/', str(page), '.html')
        )
        get_source(list_url, list_anchor_xpath, big_image_xpath)
if __name__ == '__main__':
    # Interactive entry point: ask for a category and a page range, then
    # start the crawl.
    print('壁纸的分类有: 1.美女 2.车模 3.风景 4.动物 5.汽车')

    # Ask again until the number is one of the five listed categories.
    # Previously a value of 6 or more skipped the setup entirely and the
    # later spider() call failed with NameError.
    tag = int(input('请输入一种壁纸类型的序号:'))
    while not 1 <= tag <= 5:
        tag = int(input('请输入一种壁纸类型的序号:'))
    tag_name = choice_class(tag)

    # Start pages above 666 are rejected and asked for again.
    begin = int(input('输入开始爬取的页码:'))
    while begin > 666:
        begin = int(input('输入开始爬取的页码:'))

    # spider() iterates range(begin, end), so the end page itself is not
    # fetched.
    end = int(input('输入结束的页码:'))
    spider(begin, end, tag_name)
网友评论