Heh heh, calling all you connoisseurs.
Today I'll walk you through scraping pictures of beautiful women.
We'll use requests to fetch the pages and XPath (via lxml) to parse them.
The helpers that fetch a page and parse it:
def get_tag(response, tag):
    # parse the HTML text and run an XPath query; xpath() always returns a list
    html = etree.HTML(response)
    return html.xpath(tag)

def parse_url(url):
    # fetch a page and hand back its HTML as text
    response = requests.get(url, headers=headers)
    return response.text
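These two compose everything that follows. Here's a quick offline check of get_tag (a sketch; the sample markup just mimics the site's list structure, and the import is the same one the full script below uses):

    from lxml import etree

    # xpath always returns a list, even for a single match
    sample = '<ul id="pins"><li><span><a href="/a/1">Gallery one</a></span></li></ul>'
    print(get_tag(sample, '//*[@id="pins"]/li/span[1]/a/@href'))   # ['/a/1']
    print(get_tag(sample, '//*[@id="pins"]/li/span[1]/a/text()'))  # ['Gallery one']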
Collecting each gallery's URL and title from a list page:
def url_find(url):
    # pull every gallery link and its title off one list page
    r = parse_url(url)
    url_list = get_tag(r, '//*[@id="pins"]/li/span[1]/a/@href')
    title = get_tag(r, '//*[@id="pins"]/li/span[1]/a/text()')
    for i in range(len(url_list)):
        url_jpg_find(url_list[i], title[i])
        print(title[i], 'saved')
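The index loop can also be written with zip, which pairs each gallery URL with its title directly:

    # equivalent loop, pairing URLs and titles without indexing
    for u, t in zip(url_list, title):
        url_jpg_find(u, t)
        print(t, 'saved')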
Building the URL of every page inside a gallery:
def url_jpg_find(url, title):
    global page
    page = 0    # reset the per-gallery image counter
    r = parse_url(url)
    # the last pager link's <span> holds the gallery's total page count
    url_last = int(get_tag(r, '/html/body/div[2]/div[1]/div[4]/a[5]/span/text()')[0])
    # page 1 is the gallery URL itself; later pages append /2, /3, ...
    url_list = [url] + [url + '/' + str(i) for i in range(2, url_last + 1)]
    if not os.path.exists(title):
        os.makedirs(title)
    for i in url_list:
        content_find(i, title)
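The gallery title comes straight off the page and becomes a directory name, so a stray / or ? in a title would break os.makedirs. A hypothetical safe_name helper (my addition, not in the original script) would guard against that:

    import re

    def safe_name(title):
        # hypothetical helper: replace path-hostile characters with underscores
        return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

    # usage: os.makedirs(safe_name(title), exist_ok=True)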
Pulling the image's name and real URL off a single gallery page:
def content_find(url, title):
    r = parse_url(url)
    # the <h2> heading doubles as the file name
    name = get_tag(r, '/html/body/div[2]/div[1]/h2/text()')[0]
    # the full-size image lives inside the main-image div
    url_jpg = get_tag(r, '//div[@class="main-image"]//a/img/@src')[0]
    time.sleep(0.2)    # short pause between page requests
    save(name, url_jpg, title)
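A fixed 0.2-second pause is on the aggressive side; if the site starts refusing requests, a jittered delay (a sketch using random.uniform, my substitution, not the original code) is gentler and less bot-like:

    import random, time

    # sleep for a random 0.5-1.5 s instead of a fixed 0.2 s
    time.sleep(random.uniform(0.5, 1.5))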
Saving the image to disk:
def save(name, url_jpg, title):
    global page
    # the Referer in headers matters here: many image hosts reject hotlinked requests
    r = requests.get(url_jpg, headers=headers)
    with open(os.getcwd() + '/' + title + '/' + name + '.jpg', 'wb') as j:
        j.write(r.content)    # the with block closes the file; no explicit close() needed
    page += 1
    print(page)
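save assumes every download succeeds. A hedged variant (hypothetical, not the author's code) adds a timeout, a status check, and one retry:

    def save(name, url_jpg, title):
        global page
        for attempt in range(2):    # at most one retry
            r = requests.get(url_jpg, headers=headers, timeout=10)
            if r.status_code == 200:
                break
            time.sleep(1)
        else:
            print('failed:', url_jpg)    # both attempts failed; skip this image
            return
        with open(os.path.join(os.getcwd(), title, name + '.jpg'), 'wb') as j:
            j.write(r.content)
        page += 1
        print(page)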
Putting it all together, the complete script:

import requests, os, time
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    "Referer": "https://www.mzitu.com",
}
page = 0

def get_tag(response, tag):
    # parse the HTML text and run an XPath query; xpath() always returns a list
    html = etree.HTML(response)
    return html.xpath(tag)

def parse_url(url):
    # fetch a page and hand back its HTML as text
    response = requests.get(url, headers=headers)
    return response.text

def url_find(url):
    # pull every gallery link and its title off one list page
    r = parse_url(url)
    url_list = get_tag(r, '//*[@id="pins"]/li/span[1]/a/@href')
    title = get_tag(r, '//*[@id="pins"]/li/span[1]/a/text()')
    for i in range(len(url_list)):
        url_jpg_find(url_list[i], title[i])
        print(title[i], 'saved')

def url_jpg_find(url, title):
    global page
    page = 0    # reset the per-gallery image counter
    r = parse_url(url)
    # the last pager link's <span> holds the gallery's total page count
    url_last = int(get_tag(r, '/html/body/div[2]/div[1]/div[4]/a[5]/span/text()')[0])
    # page 1 is the gallery URL itself; later pages append /2, /3, ...
    url_list = [url] + [url + '/' + str(i) for i in range(2, url_last + 1)]
    if not os.path.exists(title):
        os.makedirs(title)
    for i in url_list:
        content_find(i, title)

def content_find(url, title):
    r = parse_url(url)
    # the <h2> heading doubles as the file name
    name = get_tag(r, '/html/body/div[2]/div[1]/h2/text()')[0]
    # the full-size image lives inside the main-image div
    url_jpg = get_tag(r, '//div[@class="main-image"]//a/img/@src')[0]
    time.sleep(0.2)    # short pause between page requests
    save(name, url_jpg, title)

def save(name, url_jpg, title):
    global page
    r = requests.get(url_jpg, headers=headers)
    with open(os.getcwd() + '/' + title + '/' + name + '.jpg', 'wb') as j:
        j.write(r.content)
    page += 1
    print(page)

def main():
    start_url = 'https://www.mzitu.com'
    r = parse_url(start_url)
    # the 4th pager link on the home page shows the number of the last list page
    url_last = int(get_tag(r, '/html/body/div[2]/div[1]/div[3]/div/a[4]/text()')[0])
    url = 'https://www.mzitu.com/page/'
    url_list = ['https://www.mzitu.com'] + [url + str(i) for i in range(2, url_last + 1)]
    for url in url_list:
        url_find(url)

if __name__ == '__main__':
    main()
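main walks every list page on the site, which is a lot for a first run. A hypothetical test variant (max_pages is my addition, not the author's) caps the crawl:

    def main(max_pages=1):
        # hypothetical test harness: only crawl the first max_pages list pages
        url = 'https://www.mzitu.com/page/'
        url_list = ['https://www.mzitu.com'] + [url + str(i) for i in range(2, max_pages + 1)]
        for u in url_list:
            url_find(u)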
No result screenshots here.
Ahem, too tempting; the post would get taken down.
You'll just have to picture them yourselves.
Let's learn Python together; beginners welcome, DM me for guidance and tutorial shares.