爬取豆瓣网首页的电影信息,包括电影名、星级、导演、演员表,存入指定的文件。
爬取步骤:
1、定义HTML解析类
2、获取到指定的文件路径
3、把信息写入文件
4、保存电影信息,该步是前面三步的综合逻辑
#-*- coding: utf-8-*-
from urllib import request
import urllib
from html.parser import HTMLParser
import os
'''
爬取豆瓣网首页的电影信息,包括电影名、星级、导演、演员表
步骤:
1、定义HTML解析类
2、获取到指定的文件路径
3、把信息写入文件
4、保存电影信息,该步是前面三步的综合逻辑
'''
class MovieParse(HTMLParser):
    """HTML parser that collects movie records from ``<li>`` start tags.

    Every ``<li>`` whose ``data-title`` attribute is present and non-empty
    yields one dict appended to ``self.movies`` with the keys ``title``,
    ``rate``, ``director`` and ``actors``. Each value comes from the matching
    ``data-*`` attribute; an attribute that is absent is stored as ``None``.
    """

    # Maps each key of a movie record to the <li> attribute it is read from.
    _FIELD_ATTRS = (
        ('title', 'data-title'),
        ('rate', 'data-rate'),
        ('director', 'data-director'),
        ('actors', 'data-actors'),
    )

    def __init__(self):
        super().__init__()
        # Movie dicts in document order, filled in while feed() runs.
        self.movies = []

    @staticmethod
    def _attr(attrlist, attrname):
        """Return the value of the first attribute named *attrname*, else None."""
        for name, value in attrlist:
            if name == attrname:
                return value
        return None

    def handle_starttag(self, tag, attrs):
        """Override of HTMLParser.handle_starttag that records movie ``<li>`` tags.

        Args:
            tag: Name of the start tag being parsed.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        # Only <li> elements carrying a non-empty data-title describe a movie.
        if tag != 'li' or not self._attr(attrs, 'data-title'):
            return
        self.movies.append(
            {key: self._attr(attrs, attr) for key, attr in self._FIELD_ATTRS}
        )
def get_path():
    """Return the ``douban`` output directory under the current working dir.

    The directory is created if it does not exist yet.

    Returns:
        The directory path as a string ending with ``'/'``, so a file name
        can be appended to it directly.
    """
    new_path = os.path.join(os.getcwd(), 'douban')
    # exist_ok avoids the check-then-create race of isdir() followed by mkdir().
    os.makedirs(new_path, exist_ok=True)
    return new_path + '/'
def write_file(file_path, movielist):
    """Write one ``title|rate|director|actors`` line per movie to a text file.

    The output file is ``movies_info.txt`` inside *file_path*. It is opened
    in ``'w'`` mode with UTF-8 encoding, so existing content is replaced.

    Args:
        file_path: Directory to write into, with or without a trailing
            path separator.
        movielist: Iterable of mappings providing the keys ``title``,
            ``rate``, ``director`` and ``actors``.

    Raises:
        KeyError: If a movie mapping lacks one of the four keys.
    """
    # os.path.join keeps the file inside the directory even when file_path
    # has no trailing separator (plain concatenation would not).
    target = os.path.join(file_path, 'movies_info.txt')
    with open(target, 'w', encoding='utf-8') as f:
        f.writelines(
            '%(title)s|%(rate)s|%(director)s|%(actors)s' % movie + '\n'
            for movie in movielist
        )
def save_movies_info(url):
    """Fetch *url*, extract the movies from its HTML and save them to disk.

    Ties the other steps together: downloads the page, parses it with
    MovieParse, and writes the collected movies with write_file into the
    directory returned by get_path.

    Args:
        url: Address of the page to download; the body is decoded as UTF-8.
    """
    headers = {}
    req = urllib.request.Request(url, headers=headers)
    # The context manager closes the response even if read()/decode() raises.
    with urllib.request.urlopen(req) as res:
        html = res.read().decode('utf-8')
    filepath = get_path()
    parse = MovieParse()
    parse.feed(html)
    # close() makes the parser process any input it is still buffering.
    parse.close()
    write_file(filepath, parse.movies)
if __name__ == '__main__':
    # Scrape the Douban movie home page; nothing is returned here, the
    # call saves its results itself.
    save_movies_info('https://movie.douban.com/')
网友评论