# Scrape all the stills for one Douban movie; the example uses "Birdman".
# Collecting every page URL up front is a fragile strategy: if any page fails to load, the whole run is lost.
# Saving each image as soon as it is found, or downloading with multiple threads, would both be more robust
# (minimal sketches of both ideas follow the corresponding functions below).
import requests
from bs4 import BeautifulSoup
import time
import os
# Build the list of paginated photo-page URLs (Douban lists 30 stills per page).
def get_urls(n):
    urls = []
    for i in range(n):
        url = "https://movie.douban.com/subject/20438962/photos?type=S&start=%i" % (i * 30)  # Birdman's photo pages
        urls.append(url)
    return urls
# Extract the image links from each photo page.
def parse_url(urls, headers):
    picture_urls = []
    for ui in urls:
        ri = requests.get(url=ui, headers=headers)
        soup = BeautifulSoup(ri.text, 'lxml')
        ul = soup.find('ul', class_="poster-col3 clearfix")
        if ul is None:  # page missing or request blocked: skip it instead of crashing
            print("no photo list found on " + ui)
            continue
        lis = ul.find_all('li')
        for li in lis:
            url_link = li.find("img")["src"]
            picture_urls.append(url_link)
            print(url_link)
        time.sleep(1)  # pause 1s between pages to be polite to the server
    return picture_urls
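# The header comment suggests saving each image as soon as it is found, instead of
# collecting every link first. A minimal sketch of that idea, assuming the same page
# structure as parse_url above; scrape_and_save and its arguments are illustrative
# names, not part of the original script.
def scrape_and_save(urls, headers, path):
    for ui in urls:
        ri = requests.get(url=ui, headers=headers)
        soup = BeautifulSoup(ri.text, 'lxml')
        ul = soup.find('ul', class_="poster-col3 clearfix")
        if ul is None:
            continue
        for li in ul.find_all('li'):
            pic = li.find("img")["src"]
            picture = requests.get(pic)
            name = pic.split("/")[-1]
            with open(path + '/' + name, "wb") as f:
                f.write(picture.content)  # write immediately; a crash loses at most one image
        time.sleep(1)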
# Download and save the images.
def save_pictures(urls, path):
    for pic in urls:
        picture = requests.get(pic)
        name = pic.split("/")[-1]
        savepath = path + '/' + name
        with open(savepath, "wb") as f:
            f.write(picture.content)
        print("saved " + name)
if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
    }
    # A Douban session cookie could be supplied here if the pages ever require login.
    douban_urls = get_urls(51)
    all_pic_urls = parse_url(douban_urls, headers)
    save_path = './pictures'
    # Create the save directory if it does not exist yet
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    save_pictures(all_pic_urls, save_path)