1,基础知识
2, 自己动手写程序
from bs4 import BeautifulSoup
import requests
import time
import urllib.request  # FIX: plain `import urllib` does not guarantee the
                       # `urllib.request` submodule is bound in Python 3

# Install a process-wide HTTP proxy for urllib.request.urlopen calls.
# NOTE(review): this only affects urllib.request — the requests.get() call in
# get_page below bypasses it; pass proxies={...} to requests if both must go
# through the proxy.
#proxies = {"http": "http://139.162.8.118"}
proxy_support = urllib.request.ProxyHandler({'http': '127.0.0.1:8787'})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)
def get_page(url, page, data=None):
    """Fetch one gallery page and save every thumbnail image to disk.

    Args:
        url: URL of the listing page to scrape.
        page: Page number — used only to build each saved file's name.
        data: When None (the default) images are downloaded; any other
            value skips the download loop (kept for backward compatibility).
    """
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    imgs = soup.select('img.entry-thumbnail')
    if data is None:  # FIX: `is None` instead of `== None` (PEP 8)
        for i, img_link in enumerate(imgs):  # FIX: enumerate over range(len(...))
            img = img_link.get('src')
            print(img)
            #download(img, str_num)
            name = '/Users/aipengya/Downloads/pictures_test/' + '(' + str(page) + ')' + str(i) + '.jpg'
            # FIX: context managers close both the HTTP response and the
            # file even if an error occurs mid-download.
            # NOTE(review): urlopen goes through the proxy installed at
            # module level; the requests.get above does not.
            with urllib.request.urlopen(img) as pic, open(name, 'wb') as f:
                f.write(pic.read())
    print("Done!")
def get_more_pages(start, end):
    """Scrape gallery pages numbered start .. end-1, pausing between requests."""
    url_template = 'http://weheartit.com/inspirations/taylorswift?scrolling=true&page={}'
    for page_number in range(start, end):
        get_page(url_template.format(page_number), page_number)
        time.sleep(2)  # throttle so we don't hammer the server
#def download(img, str_num):
# file_name = path + str_num + '.jpg'
# img_data = urllib.request.urlopen(img).read()
# f = open(file_name, 'wb').write(img_data)
# f.close()
get_more_pages(1, 3)
#main-container > div > div > div > div > div > a > img
3, 反思与总结
- 代码中也要设代理,虽然目前还不清楚代理应该怎么设。
网友评论