import requests
from lxml import etree
from urllib import request

url = 'http://i.jandan.net/ooxx'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36'}


def get_next_page(url):
    # Follow the "Older Comments" link to find the URL of the next (older) page.
    resp = requests.get(url, headers=headers)
    r_path = etree.HTML(resp.text)
    r_Xpath = r_path.xpath("//a[@title='Older Comments']/@href")
    # The href is protocol-relative (starts with //), so prepend the scheme.
    return 'http:' + r_Xpath[0]


def parse_urls(url):
    # Collect every image link on the page and download each one.
    resp = requests.get(url, headers=headers)
    r_path = etree.HTML(resp.text)
    r_Xpath = r_path.xpath("//div[@class='commenttext']//a/@href")
    for i in r_Xpath:
        title = i.split('/')[-1]           # use the last path segment as the file name
        htmls = 'http:' + i                # image links are protocol-relative as well
        request.urlretrieve(htmls, title)  # save the image to the current directory


href = []
while True:
    number = input('enter a word: ')
    if number != 'q':
        html = get_next_page(url)   # remember the next page for the following round
        href.append(html)
        print(url)
        parse_urls(url)             # download every image on the current page
        url = href[-1]
    else:
        break
This one isn't all that complicated: it only uses requests, lxml's etree, and urllib's request.urlretrieve.
Turns out this is really fun. I'm getting a bit full of myself.
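For reference, urlretrieve just takes a remote URL and a local file name. A minimal sketch of the call shape (the URL and file name below are placeholders, not from this post):

from urllib import request

# Placeholder URL and file name, only to illustrate the two arguments.
request.urlretrieve('http://example.com/sample.jpg', 'sample.jpg')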
Each time through the loop I have to type a character; as long as it isn't 'q', the loop runs once more and downloads images. So right now, typing one character downloads one page and prints the page URL the images came from.
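As a side note, the same flow could be driven by a fixed page count instead of keyboard input. A minimal sketch that reuses the get_next_page and parse_urls functions above (the count of 3 pages is an arbitrary choice for illustration):

pages_to_fetch = 3                       # arbitrary number of pages, for illustration
current = url
for _ in range(pages_to_fetch):
    print(current)                       # show which page is being downloaded
    parse_urls(current)                  # download every image on this page
    current = get_next_page(current)     # move on to the next (older) page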
Running the original script prints each page URL and downloads that page's images.
Then check whether the images were actually downloaded.
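One quick way to check is to list the files in the working directory. A minimal sketch, assuming the images were saved into the current directory under their original names and that they use common extensions like .jpg/.png/.gif (that extension list is my assumption, not from the post):

import os

# List whatever image files ended up in the current directory.
downloaded = [f for f in os.listdir('.') if f.lower().endswith(('.jpg', '.png', '.gif'))]
print(len(downloaded), 'images downloaded:', downloaded)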