import requests
import re
import os
# Destination directory for the downloaded images
picture_path = '/Downloads/project1/图片/'
# Request the URL; a status code of 200 means success
def get_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    return None  # make the failure case explicit
# Parse the page title and create a matching download folder
def get_path(index_page):
    title = re.findall(r'<title>(.*?)</title>', index_page)
    # The title looks like "name|site name"; keep the part before '|',
    # and fall back to a fixed name if no <title> was found
    file_path = title[0].split('|')[0].strip() if title else 'untitled'
    new_path = os.path.join(picture_path, file_path)
    if not os.path.exists(new_path):
        os.makedirs(new_path)  # create intermediate directories too
    return new_path
# Parse the index page and yield the detail-page URLs it links to
def parse_index(index_page):
    urls = re.findall(r'href="(.*?)"', index_page, re.S)
    for url in urls:
        if url.startswith('/d'):
            yield 'http://www.xiaohuar.com' + url
# Download each image into the folder created above
def get_picture(new_path, detail_page):
    try:
        response = requests.get(detail_page)
        if response.status_code == 200:
            # Use the last URL segment as the file name
            name = detail_page.split('/')[-1]
            filepath = os.path.join(new_path, name)
            with open(filepath, 'wb') as f:
                f.write(response.content)
            print('%s downloaded successfully' % detail_page)
    except Exception as e:
        print(e)
def main():
    # Index pages to crawl; {page_num} is filled in below
    base_url = 'http://www.xiaohuar.com/s-1-19{page_num}.html#p1'
    # Count of downloaded images
    count = 0
    for i in range(1, 99):
        url = base_url.format(page_num=i)
        index_page = get_page(url)
        if index_page is None:  # skip pages that failed to load
            continue
        new_path = get_path(index_page)
        detail_urls = parse_index(index_page)
        for detail_url in detail_urls:
            count += 1
            get_picture(new_path, detail_url)
    print('Downloaded %s images in total' % count)

if __name__ == '__main__':
    main()
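
# --- Optional hardening (a sketch, not part of the original script) ---
# get_page() above gives up after a single request. The variant below
# adds a browser-like User-Agent, a timeout, and a short retry loop;
# the header value, timeout, and retry count are illustrative
# assumptions, not values from the original code. To use it, call it
# in place of get_page() inside main().
import time

def get_page_safe(url, retries=3):
    headers = {'User-Agent': 'Mozilla/5.0'}  # assumed; adjust per site
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response.text
        except requests.RequestException as e:
            print('retry %s for %s: %s' % (attempt + 1, url, e))
        time.sleep(1)  # brief pause between attempts, polite to the server
    return None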