# 爬取猫眼电影存入csv
from urllib import request
import time
import re
import csv
class MaoyanSpider(object):
    """Scrape the Maoyan ``board/4`` listing pages and append films to a CSV file.

    Each page is downloaded, parsed with a regular expression into
    ``(title, stars, release time)`` triples, and appended to
    ``maoyanfilm.csv`` in the current working directory.
    """

    # Compiled once at class creation instead of on every parse_page call.
    # re.S lets ``.`` match newlines, because one film entry spans several
    # lines of HTML.
    _FILM_PATTERN = re.compile(
        r'<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>',
        re.S,
    )

    def __init__(self):
        """Set the base URL, the request headers and the page counter."""
        self.baseurl = 'https://maoyan.com/board/4?offset='
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
        }
        # Number of the page being crawled; used only for progress output.
        self.page = 1

    def get_page(self, url):
        """Download ``url`` and pass the decoded HTML on to ``parse_page``.

        Args:
            url: Full URL of the listing page to fetch.
        """
        req = request.Request(url, headers=self.headers)
        # The context manager closes the HTTP response even if reading or
        # decoding fails.
        with request.urlopen(req) as res:
            # The body is decoded as UTF-8 (assumes the site serves UTF-8).
            html = res.read().decode('utf-8')
        # Hand the page straight to the parser.
        self.parse_page(html)

    def parse_page(self, html):
        """Extract every film entry from ``html`` and pass them to ``write_page``.

        Args:
            html: Decoded HTML text of one listing page.
        """
        r_list = self._FILM_PATTERN.findall(html)
        self.write_page(r_list)

    def write_page(self, r_list):
        """Append the parsed films to ``maoyanfilm.csv``.

        Args:
            r_list: Iterable of 3-item sequences ``(title, stars, release
                time)``; surrounding whitespace is stripped from each field.
        """
        film_list = [
            (rt[0].strip(), rt[1].strip(), rt[2].strip())
            for rt in r_list
        ]
        # newline='' is required by the csv module so that the writer's own
        # '\r\n' terminator is not translated again (blank rows on Windows).
        # An explicit encoding keeps the Chinese text independent of the
        # platform's locale default.
        with open('maoyanfilm.csv', 'a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerows(film_list)

    def main(self):
        """Crawl the pages at offsets 0, 10, 20, 30 and 40, pausing between them."""
        # The site pages its results through the ``offset`` query parameter.
        for offset in range(0, 41, 10):
            url = self.baseurl + str(offset)
            self.get_page(url)
            print('第%d页爬取成功' % self.page)
            self.page += 1
            # Wait one second between requests.
            time.sleep(1)
if __name__ == '__main__':
    # Script entry point: build the spider and start crawling.
    MaoyanSpider().main()
网友评论