Analyzing the Web Page
Viewing the page source in a browser shows that the movie information and the cover-image links are present directly in the HTML, which means the target page is static and can be scraped directly.
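A quick way to confirm this is to fetch the raw HTML in code and check that a known entry appears in it; if it does, the data is not produced by JavaScript and plain urllib is enough. A minimal sketch, assuming (as the rest of the article does) that the site accepts requests from urllib's default User-Agent:

import urllib.request

# Fetch the raw HTML exactly as the scraper will see it.
html = urllib.request.urlopen('https://movie.douban.com/top250').read().decode('utf-8')

# If a title known to be on the list appears in the raw source,
# the movie data is embedded in the static page.
print('肖申克的救赎' in html)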
Scraping Approach
- Fetch the page source with urllib.request
- Parse the source with BeautifulSoup and use the find_all method to pick out the movie-information nodes and the image-link nodes
- Write the extracted movie information and image links to an Excel sheet
- Download the cover images with urllib.request's urlretrieve method
Code Implementation
- Fetch the page source with urllib.request
def get_source(url):
    content = urllib.request.urlopen(url)
    source = content.read().decode('utf-8')
    return source
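Some sites reject requests that carry urllib's default User-Agent. If the bare urlopen call above ever starts returning an HTTP error, a common workaround is to send a browser-like User-Agent header; a hedged sketch of such a variant (the header value is only an example):

import urllib.request

def get_source(url):
    # Build a Request carrying a browser-like User-Agent header so the
    # server treats the fetch like an ordinary browser visit.
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    content = urllib.request.urlopen(req)
    return content.read().decode('utf-8')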
- Parse the source with BeautifulSoup and use the find_all method to pick out the movie information and image links
def get_movielist(source):
    movielist = []
    soup = BeautifulSoup(source, 'html.parser')
    info = soup.find_all('div', class_="item")
    for i in info:
        name = i.div.img['alt']
        img = i.div.img['src']
        score = i.find_all('span', class_="rating_num")[0].get_text()
        if i.find_all(class_="inq"):
            remark = i.find_all(class_="inq")[0].get_text()
        else:
            remark = None
        movielist.append([name, img, score, remark])
    return movielist
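The attribute lookups above imply the shape of the list-page markup: each movie sits in a div with class item, whose first inner div holds an img carrying the title in alt and the cover URL in src, while the rating lives in a span with class rating_num and the one-line quote in an element with class inq. The sketch below runs the function just defined against a hand-written fragment of that shape (the fragment is reconstructed from the selectors, not copied from Douban):

from bs4 import BeautifulSoup

# A minimal fragment matching the structure the selectors expect.
sample = '''
<div class="item">
  <div class="pic"><img alt="肖申克的救赎" src="https://example.com/cover.jpg"></div>
  <span class="rating_num">9.7</span>
  <span class="inq">希望让人自由。</span>
</div>
'''

print(get_movielist(sample))
# [['肖申克的救赎', 'https://example.com/cover.jpg', '9.7', '希望让人自由。']]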
- Write the extracted movie information and image links to an Excel sheet
def excel(ws, movlist):
    global index
    style = xlwt.XFStyle()
    font = xlwt.Font()
    font.name = '黑体'
    font.bold = True
    style.font = font
    headdata = ['电影名称', '电影封面', '豆瓣评分', '经典评语']
    for i in range(4):
        ws.write(0, i, headdata[i], style)
    for i in movlist:
        for j in range(4):
            ws.write(index, j, i[j])
        index += 1
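excel() relies on a module-level index counter so that rows keep accumulating as successive pages are written into the same sheet; the header occupies row 0, so the counter starts at 1. A minimal usage sketch, mirroring what the main block below does (path and data are placeholders):

import xlwt

wb = xlwt.Workbook()
ws = wb.add_sheet('movie', cell_overwrite_ok=True)

index = 1                     # first data row; row 0 holds the headers
excel(ws, [['肖申克的救赎', 'https://example.com/cover.jpg', '9.7', '希望让人自由。']])
wb.save('movies.xls')         # xlwt only writes the legacy .xls format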
- Download the cover images with urllib.request's urlretrieve method
def img_download(movieslist):
    for i in movieslist:
        urllib.request.urlretrieve(i[1], 'd:/pic/{}.jpg'.format(i[0]))
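urlretrieve will fail if d:/pic does not already exist, and a movie name containing characters that Windows forbids in filenames would also break the save. A hedged sketch of a more defensive variant (the folder default and the replacement rule are only examples):

import os
import re
import urllib.request

def img_download(movieslist, folder='d:/pic'):
    # Create the target folder if it is missing.
    os.makedirs(folder, exist_ok=True)
    for name, img_url, *_ in movieslist:
        # Strip characters that are not allowed in Windows filenames.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
        urllib.request.urlretrieve(img_url, '{}/{}.jpg'.format(folder, safe_name))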
- Full scraper source
#!/usr/bin/env python3.6
# coding:utf-8
# @Author : Natsume
# @Filename : douban.py
'''
@Description:
Scraper for the Douban Movie Top 250 and its cover images; change the
start parameter in the URL to scrape a specific page of movie information.
'''
import urllib.request
from bs4 import BeautifulSoup
import xlwt


# Fetch the page source
def get_source(url):
    content = urllib.request.urlopen(url)
    source = content.read().decode('utf-8')
    return source


# Parse the page with BeautifulSoup and pick out the movie information
# and image links with find_all
def get_movielist(source):
    movielist = []
    soup = BeautifulSoup(source, 'html.parser')
    info = soup.find_all('div', class_="item")
    for i in info:
        name = i.div.img['alt']
        img = i.div.img['src']
        score = i.find_all('span', class_="rating_num")[0].get_text()
        if i.find_all(class_="inq"):
            remark = i.find_all(class_="inq")[0].get_text()
        else:
            remark = None
        movielist.append([name, img, score, remark])
    return movielist


# Write the scraped movie information and image links to an Excel sheet
def excel(ws, movlist):
    global index
    style = xlwt.XFStyle()
    font = xlwt.Font()
    font.name = '黑体'
    font.bold = True
    style.font = font
    headdata = ['电影名称', '电影封面', '豆瓣评分', '经典评语']
    for i in range(4):
        ws.write(0, i, headdata[i], style)
    for i in movlist:
        for j in range(4):
            ws.write(index, j, i[j])
        index += 1


# Download the cover images from the scraped image links
def img_download(movieslist):
    for i in movieslist:
        urllib.request.urlretrieve(i[1], 'd:/pic/{}.jpg'.format(i[0]))


# Script entry point
if __name__ == '__main__':
    index = 1
    wb = xlwt.Workbook()
    wsheet = wb.add_sheet('movie', cell_overwrite_ok=True)
    for x in range(10):
        print('正在爬取第%s页' % (x + 1))
        source = get_source('https://movie.douban.com/top250?start={}'
                            '&filter='.format(x * 25))
        movieslist = get_movielist(source)
        excel(wsheet, movieslist)
        print('正在下载第{}页'.format(x + 1))
        img_download(movieslist)
    savepath = 'd:/python/豆瓣top250.xls'
    wb.save(savepath)
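After a run, the spreadsheet can be spot-checked programmatically; a small sketch using xlrd (assumed to be installed; it reads the legacy .xls format that xlwt produces):

import xlrd

# Open the workbook the scraper produced and print the first few data rows.
book = xlrd.open_workbook('d:/python/豆瓣top250.xls')
sheet = book.sheet_by_name('movie')
print(sheet.nrows)                 # 251 expected: 1 header row + 250 movies
for r in range(1, 4):
    print(sheet.row_values(r))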