items.py:
import scrapy

class MovieItem(scrapy.Item):
    # define the fields for your item here:
    name = scrapy.Field()
    score = scrapy.Field()
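A scrapy.Item behaves like a dict restricted to its declared fields, which is what the spider and pipeline below rely on. A quick illustration with made-up sample values:

    item = MovieItem()
    item['name'] = 'Example Movie'   # sample value, not scraped
    item['score'] = '9.0'
    print(dict(item))                # {'name': 'Example Movie', 'score': '9.0'}
    # item['year'] = 2019            # would raise KeyError: undeclared field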
maoyan.py (the spider):
# -*- coding: utf-8 -*-
import scrapy
from demo1.items import MovieItem

class MaoyanSpider(scrapy.Spider):
    name = 'maoyan'
    allowed_domains = ['maoyan.com']
    start_urls = ['http://maoyan.com/films?offset=30']

    def parse(self, response):
        # Titles live in the @title attribute; each score div's text is split
        # across child tags, so string(.) concatenates it into one string.
        names = response.xpath('//div[@class="channel-detail movie-item-title"]/@title').extract()
        scores = [score.xpath('string(.)').extract_first()
                  for score in response.xpath('//div[@class="channel-detail channel-detail-orange"]')]

        # Alternative: yield plain dicts instead of Items:
        # for name, score in zip(names, scores):
        #     yield {"name": name, "score": score}

        # Create a fresh MovieItem per iteration; reusing a single instance
        # would yield references to the same object as it mutates.
        for name, score in zip(names, scores):
            item = MovieItem()
            item['name'] = name
            item['score'] = score
            yield item
        # Besides Requests, parse() may only yield dicts or defined Items;
        # the pipeline receives the corresponding dict or Item.
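In newer Scrapy releases, .get() and .getall() are the preferred spellings of .extract_first() and .extract(); the XPath expressions themselves are unchanged:

    names = response.xpath('//div[@class="channel-detail movie-item-title"]/@title').getall()
    scores = [s.xpath('string(.)').get()
              for s in response.xpath('//div[@class="channel-detail channel-detail-orange"]')]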
pipelines.py:
import json

class Demo1Pipeline(object):
    def open_spider(self, spider):
        # Open the output file once, when the spider starts.
        self.file = open('movie.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Alternative: reopen the file in append mode for every item:
        # with open('movie.txt', 'a', encoding='utf-8') as f:
        #     f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        # When serializing, convert the Item to a dict first; json.dumps
        # cannot handle Item objects directly.
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item  # return the item so later pipelines can also use it

    def close_spider(self, spider):
        self.file.close()
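The same JSON-lines output can also be produced with Scrapy's bundled exporters instead of hand-rolled json.dumps calls. A minimal sketch; the class name JsonLinesPipeline and the movie.jl filename are illustrative, and the class would need its own ITEM_PIPELINES entry:

    from scrapy.exporters import JsonLinesItemExporter

    class JsonLinesPipeline(object):
        def open_spider(self, spider):
            self.file = open('movie.jl', 'wb')  # exporters write bytes, so binary mode
            self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False)
            self.exporter.start_exporting()

        def process_item(self, item, spider):
            self.exporter.export_item(item)     # converts the Item to a dict internally
            return item

        def close_spider(self, spider):
            self.exporter.finish_exporting()
            self.file.close()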
settings.py:
ITEM_PIPELINES = {
    # Dotted path to the pipeline class; the number is its priority order:
    # the smaller the value, the earlier the pipeline runs.
    'demo1.pipelines.Demo1Pipeline': 300,
}
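Multiple pipelines can be registered at once; each item flows through them in ascending priority order, which is why process_item returns the item above. For example, chaining the hypothetical JsonLinesPipeline sketched earlier after Demo1Pipeline:

    ITEM_PIPELINES = {
        'demo1.pipelines.Demo1Pipeline': 300,
        'demo1.pipelines.JsonLinesPipeline': 400,  # runs after Demo1Pipeline
    }

Once the pipeline is enabled, the spider runs from the project root with: scrapy crawl maoyan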