Create a .py file in the same directory as settings.py (here it is named signalceshi.py, matching the module path used in step 2).
1. Write the code
# coding=utf-8
from scrapy import signals

# Module-level counter for items scraped during the run.
hahaha = 0


class QianlongwangSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # Scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        # Connect one handler to each built-in signal of interest.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(s.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(s.spider_error, signal=signals.spider_error)
        crawler.signals.connect(s.spider_idle, signal=signals.spider_idle)
        return s

    # Sent when a spider starts crawling. Typically used to allocate
    # per-spider resources, but it can do anything.
    def spider_opened(self, spider):
        spider.logger.info('crawler started: %s' % spider.name)
        print('start', '1')

    # Sent each time an item has been scraped, after it has passed all
    # the item pipeline stages without being dropped.
    def item_scraped(self, item, response, spider):
        global hahaha
        hahaha += 1

    # Sent when a spider is closed. Can be used to release the resources
    # allocated in spider_opened.
    def spider_closed(self, spider, reason):
        print('-------------------------------all over------------------------------------------')
        print(spider.name, 'closed,', hahaha, 'items scraped')

    # Sent when a spider callback produces an error (i.e. raises an exception).
    def spider_error(self, failure, response, spider):
        code = response.status
        print('spider error, response status:', code)

    # Sent when the spider goes idle, which means it has no further:
    #   requests waiting to be downloaded,
    #   requests scheduled, or
    #   items being processed in the item pipeline.
    # (Raising scrapy.exceptions.DontCloseSpider here would keep it open.)
    def spider_idle(self, spider):
        for i in range(10):
            print(spider.name)
2. Register the middleware in DOWNLOADER_MIDDLEWARES
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'news.signalceshi.QianlongwangSpiderMiddleware': 543,
}
Note that the old scrapy.contrib.* path is deprecated; scrapy.downloadermiddlewares.useragent.UserAgentMiddleware is the current module path. Also, since this class only uses from_crawler to connect signal handlers, registering it under SPIDER_MIDDLEWARES would work just as well.
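To check that everything is wired up, a minimal spider like the sketch below could be run (the spider name, start URL, and parse logic are placeholders invented for this example; the project is assumed to be named news):

# news/spiders/qlw.py -- minimal test spider (hypothetical)
import scrapy

class QianlongwangSpider(scrapy.Spider):
    name = 'qianlongwang'
    start_urls = ['http://www.qianlong.com/']  # assumed start URL

    def parse(self, response):
        # Yield one dummy item so item_scraped fires at least once.
        yield {'url': response.url}

Running scrapy crawl qianlongwang should print 'start 1' on open, count each item, print the spider name ten times when the spider goes idle, and finally print the "all over" banner with the item total.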
The official documentation on signals, for reference:
https://scrapy.readthedocs.io/en/latest/topics/signals.html
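The docs linked above also show that a middleware is not strictly required: a spider can connect its own handlers by overriding from_crawler. A minimal sketch of that pattern (the spider name and URL are placeholders):

import scrapy
from scrapy import signals

class SignalDemoSpider(scrapy.Spider):
    name = 'signal_demo'
    start_urls = ['http://example.com/']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(SignalDemoSpider, cls).from_crawler(crawler, *args, **kwargs)
        # Connect the bound method to the spider_closed signal.
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        spider.logger.info('Spider closed: %s', spider.name)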