1、首先创建好爬虫项目和爬虫类(下面是 maoyanmovie.py 中的爬虫类),编写解析代码时可以先用 scrapy shell 进行调试。
# -*- coding: utf-8 -*-
import scrapy
import json
class MaoyanmovieSpider(scrapy.Spider):
    """Spider that walks ten pages of the Maoyan board and stores each movie.

    Every page listed in ``start_urls`` is handed to :meth:`parse`, which
    appends one record per movie to ``movie.json`` in the working directory.
    """

    name = 'maoyanmovie'
    # allowed_domains = ['https://maoyan.com/board/4']

    # Ten board pages selected through the ``offset`` query parameter
    # (0, 10, ..., 90).  A comprehension is used so that no helper names end
    # up as class attributes and the builtin ``list`` is not shadowed.
    start_urls = ['https://maoyan.com/board/4?offset={}'.format(n * 10) for n in range(0, 10)]
    print(start_urls)

    @staticmethod
    def _clean(text_nodes, label):
        """Join *text_nodes*, trim surrounding whitespace and drop *label*."""
        return ''.join(text_nodes).strip().replace(label, '')

    def parse(self, response):
        """Extract all movies from one board page and append them to movie.json.

        Fields are read per ``<dd>`` entry with relative XPaths, so a movie
        that lacks one field gets an empty string for it instead of shifting
        the values of every following movie.

        Each record is written as a JSON object followed by ``,`` and a
        newline; the file is therefore a sequence of records rather than one
        valid JSON document (format kept unchanged for existing consumers).

        :param response: the downloaded board page.
        :return: ``None``; results are written to ``movie.json``.
        """
        boards = response.xpath('//*[@id="app"]/div/div/div[1]/dl')
        if not boards:
            # Layout change or a non-board page: skip it instead of raising
            # IndexError on ``[0]``.
            self.logger.warning('No movie list found on %s', response.url)
            return
        board = boards[0]

        with open('movie.json', 'a', encoding='utf-8') as f:
            for movie in board.xpath('.//dd'):
                info = movie.xpath('.//div[@class="movie-item-info"]')
                score_box = movie.xpath(
                    './/div[@class="movie-item-number score-num"]/p[@class="score"]')

                title = info.xpath('./p[@class="name"]/a/@title').extract_first(default='')
                actor = self._clean(
                    info.xpath('./p[@class="star"]/text()').extract(), '主演:')
                releastime = self._clean(
                    info.xpath('./p[@class="releasetime"]/text()').extract(), '上映时间:')
                # The score is rendered as two <i> elements: integer part and
                # fractional part; concatenating them yields e.g. "9.5".
                score = (
                    score_box.xpath('./i[@class="integer"]/text()').extract_first(default='')
                    + score_box.xpath('./i[@class="fraction"]/text()').extract_first(default='')
                )
                rank = movie.xpath('./i[1]/text()').extract_first(default='')

                print(str(title) + '----' + str(actor) + '----' + str(releastime) + '---' + str(score) + '---' + rank)

                # A fresh dict per movie so records never share state.
                item = {
                    '电影名称': str(title),
                    '演员': str(actor),
                    '上映时间': str(releastime),
                    '评分': str(score),
                    '排行榜': str(rank),
                }
                # Serialize the record; ensure_ascii=False keeps the Chinese
                # text readable in the output file.
                data = json.dumps(item, ensure_ascii=False) + ',' + '\n'
                f.write(data)
2、在 settings.py 中完成以下变量设置:
# Project name; generated together with the Scrapy project.
BOT_NAME = "maoyan"

# Spider module settings; both are generated together with the project.
SPIDER_MODULES = ["maoyan.spiders"]
NEWSPIDER_MODULE = "maoyan.spiders"

# User-Agent string sent with requests.
USER_AGENT = "maoyan (+http://www.yourdomain.com)"

# Respect robots.txt rules.
ROBOTSTXT_OBEY = True

# Spider middleware classes mapped to their order values.
SPIDER_MIDDLEWARES = {
    "maoyan.middlewares.MaoyanSpiderMiddleware": 543,
}

# Downloader middleware classes mapped to their order values.
DOWNLOADER_MIDDLEWARES = {
    "maoyan.middlewares.MaoyanDownloaderMiddleware": 543,
}
注意:复制一行用 Ctrl+D,删除一行用 Ctrl+Y。
网友评论