Practice scraping a dynamically loaded webpage and saving its images to local disk. The original beauty-photo site was hard to reach, so I switched to scraping images from knewone instead; the loading mechanism is the same.
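As you scroll, the page appends new items; each batch is just the same URL fetched again with an incrementing ?page=N query parameter (easy to confirm in the browser's Network panel), so plain requests calls are enough. Before running the full script below, a quick sketch to verify that the CSS selectors actually match (they are the same selectors the script uses):

import requests
from bs4 import BeautifulSoup

resp = requests.get('https://knewone.com/things/categories/sheng-huo?page=1')
soup = BeautifulSoup(resp.text, 'lxml')
# Print a few product titles to confirm the selector matches something.
for a in soup.select('section.content > h4 > a')[:3]:
    print(a.get_text().strip())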
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Scrape titles and image URLs from a dynamically loaded page on knewone
# and download the images to a local folder.
#
from bs4 import BeautifulSoup
import requests
import os
import urllib.request


def getemtext(element):
    # Extract the element's text, trimming surrounding whitespace.
    return element.get_text().strip()


def get_target_info(url):
    wbdata = requests.get(url)
    soup = BeautifulSoup(wbdata.text, 'lxml')
    titlelist = soup.select('section.content > h4 > a')
    imglist = soup.select('header.cover > a > img')
    datalist = []
    for title, img in zip(titlelist, imglist):
        data = {
            "title": getemtext(title),
            # Strip the Upyun thumbnail suffix ("...jpg!xxx") to get the full-size image.
            "img": img.get('src').split('!')[0]
        }
        print(data)
        datalist.append(data)
    return datalist


def downimg(url, filename=''):
    # Download with urllib; name the file after the title when one is given.
    if not filename:
        filename = os.path.basename(url)
    else:
        filename = filename + '.' + url.split('.')[-1]
    filepath = os.path.join('knewonepic', filename)
    urllib.request.urlretrieve(url, filepath)


def downimgproxy(url, filename=''):
    # Download through a proxy, sending a browser-like User-Agent.
    proxies = {'http': "207.62.234.53:8118"}
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    response = requests.get(url, proxies=proxies, headers=headers)
    if response.status_code != 200:
        return
    if not filename:
        filename = os.path.basename(url)
    else:
        filename = filename + '.' + url.split('.')[-1]
    filepath = os.path.join('knewonepic', filename)
    with open(filepath, 'wb') as f:
        f.write(response.content)


if __name__ == "__main__":
    folder = 'knewonepic'
    if not os.path.exists(folder):
        os.mkdir(folder)
    url = 'https://knewone.com/things/categories/sheng-huo'
    # The "infinite scroll" is really just ?page=N; fetch pages 1-9.
    urls = [url + "?page={}".format(pageid) for pageid in range(1, 10)]
    for url in urls:
        datas = get_target_info(url)
        for data in datas:
            downimgproxy(data['img'], data['title'])
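One caveat worth flagging: the scraped title goes straight into the file name, so a title containing a path separator or other characters that are illegal in file names would make the write fail. A minimal guard sketch (the helper name safe_filename is mine, not part of the original script):

import re

def safe_filename(name, maxlen=100):
    # Replace characters that are invalid in file names on common platforms.
    name = re.sub(r'[\\/:*?"<>|]', '_', name)
    # Keep the name reasonably short to stay under filesystem limits.
    return name.strip()[:maxlen]

# Hypothetical usage inside the download helpers:
# filename = safe_filename(filename) + '.' + url.split('.')[-1]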
Partial run output:
{'img': 'https://making-photos.b0.upaiyun.com/photos/2b70f6cd1b3f54f693a04746c697dc4c.jpg', 'title': 'Humanscale World Chair 工作椅'}
{'img': 'https://making-photos.b0.upaiyun.com/photos/abe59bbfa954ba252bcdb69d21893246.jpg', 'title': 'Lithe Clock'}
{'img': 'https://making-photos.b0.upaiyun.com/photos/93a1293213d0c97322f457dedd484576.jpg', 'title': '日式取暖桌こたつ'}
{'img': 'https://making-photos.b0.upaiyun.com/photos/503d0ec2e327089a8b05b1b94f0a1611.jpg', 'title': '磁力沙漏'}
{'img': 'https://making-photos.b0.upaiyun.com/photos/fcddd7860eb8d3a80ab6c4c7676ea899.jpg', 'title': 'Anglepoise Original 1227 台灯'}
{'img': 'https://making-photos.b0.upaiyun.com/photos/5aa04046f2a53d4a3eb5a74d92fc0981.jpg', 'title': 'Starry Light'}
{'img': 'https://making-photos.b0.upaiyun.com/photos/549b66529d656c3dd1194926a8d1b71e.png', 'title': 'The Swiss Musical Starship'}
{'img': 'https://making-photos.b0.upaiyun.com/photos/91ed4e9eab2d3969ae6fae753a731a92.jpg', 'title': 'Philips Hue LED 灯泡'}
{'img': 'https://making-photos.b0.upaiyun.com/photos/cc2f649e5ff5990492ebd32931ce90f5.jpg', 'title': 'Slimline'}
The images downloaded to the local folder, each named after the title scraped from the page:
Summary
- Two ways to download the images:
(1) downimg() downloads an image with urllib.request.urlretrieve
(2) downimgproxy() downloads through a proxy. The proxy part still needs more study: I got the code running by following the instructor's example in the video, but I haven't fully thought it through yet (see the sketch below).
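One detail that may explain the confusion around (2): requests chooses a proxy by matching the request URL's scheme against the keys of the proxies dict. The script only supplies an 'http' key, so the https:// image URLs above are actually fetched directly, bypassing the proxy. A minimal sketch of a scheme-complete configuration (the proxy address is the one from the script and is likely long dead; treat it as a placeholder):

import requests

# requests routes a request through the proxy whose key matches the
# scheme of the request URL ('http' vs 'https').
proxies = {
    'http': 'http://207.62.234.53:8118',   # applies to http:// URLs
    'https': 'http://207.62.234.53:8118',  # applies to https:// URLs
}
resp = requests.get(
    'https://making-photos.b0.upaiyun.com/photos/2b70f6cd1b3f54f693a04746c697dc4c.jpg',
    proxies=proxies,
    timeout=10,  # fail fast if the proxy is unreachable
)
print(resp.status_code)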