Scrapy Advanced Development (3): Deduplication Strategy

Author: 9c0ddf06559c | Published 2018-03-01 21:47

    Flow

    1. Before the Engine hands a request over to the scheduler, it first has to check whether that request should be filtered out; a signal-listening sketch follows the code below.

    # Source: scrapy.core.engine.ExecutionEngine
    class ExecutionEngine(object):
    
        def schedule(self, request, spider):
            self.signals.send_catch_log(signal=signals.request_scheduled,
                    request=request, spider=spider)
            # Ask the scheduler's enqueue_request method whether to keep this request; see step 2
            if not self.slot.scheduler.enqueue_request(request):
                self.signals.send_catch_log(signal=signals.request_dropped,
                                            request=request, spider=spider)
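
    If you want to watch which requests the engine drops in that branch, you can listen for the request_dropped signal it sends there. Below is a minimal sketch of such an extension; the class name LogDroppedRequests and its module path are illustrative, not part of scrapy.

    # Hypothetical extension that logs requests rejected by the scheduler.
    # Enable it via EXTENSIONS = {'myproject.extensions.LogDroppedRequests': 500}
    import logging
    
    from scrapy import signals
    
    logger = logging.getLogger(__name__)
    
    class LogDroppedRequests(object):
    
        @classmethod
        def from_crawler(cls, crawler):
            ext = cls()
            # request_dropped fires when the scheduler refuses a request,
            # e.g. because the dupefilter has already seen it
            crawler.signals.connect(ext.request_dropped, signal=signals.request_dropped)
            return ext
    
        def request_dropped(self, request, spider):
            logger.info("Dropped request (likely a duplicate): %s", request.url)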
    

    2. The decision itself is made by the scheduler's enqueue_request method; a dont_filter example follows the code below.

    # Source: scrapy.core.scheduler.Scheduler
    class Scheduler(object):
        def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None,
                     logunser=False, stats=None, pqclass=None):
            # dupefilter is the concrete dedup filter, see step 3
            self.df = dupefilter
            self.dqdir = self._dqdir(jobdir)
            self.pqclass = pqclass
            self.dqclass = dqclass
            self.mqclass = mqclass
            self.logunser = logunser
            self.stats = stats
        ...
        ...
        ...
        def enqueue_request(self, request):
            # self.df.request_seen performs the actual "have we seen this request?" check.
            # If the request does not set dont_filter (i.e. it may be filtered) and the
            # dupefilter reports it as already seen, log the duplicate and return False
            # so the request is dropped.
            if not request.dont_filter and self.df.request_seen(request):
                self.df.log(request, self.spider)
                return False
            # Otherwise push the request onto the disk queue (if JOBDIR is set) or the memory queue
            dqok = self._dqpush(request)
            if dqok:
                self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
            else:
                self._mqpush(request)
                self.stats.inc_value('scheduler/enqueued/memory', spider=self.spider)
            self.stats.inc_value('scheduler/enqueued', spider=self.spider)
            return True
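
    Note the dont_filter flag above: a spider can opt a single request out of deduplication. A minimal sketch (the spider and URL are only for illustration):

    import scrapy
    
    class RefreshSpider(scrapy.Spider):
        # Hypothetical spider, shown only to illustrate dont_filter
        name = 'refresh'
        start_urls = ['http://example.com/index.html']
    
        def parse(self, response):
            # A second request to the same URL would normally be filtered out by
            # RFPDupeFilter; dont_filter=True bypasses the request_seen check entirely.
            yield scrapy.Request(response.url, callback=self.parse_again,
                                 dont_filter=True)
    
        def parse_again(self, response):
            self.logger.info('Re-crawled %s', response.url)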
    

    3. Scrapy's dedup filters live in scrapy/dupefilters.py; sketches of the fingerprint behaviour and of a custom filter follow the source below.

    # Base class for dedup filters; it defines the methods a dupefilter must implement
    class BaseDupeFilter(object):
    
        @classmethod
        def from_settings(cls, settings):
            return cls()
    
        def request_seen(self, request):
            return False
    
        def open(self):  # can return deferred
            pass
    
        def close(self, reason):  # can return a deferred
            pass
    
        def log(self, request, spider):  # log that a request has been filtered
            pass
    
    
    # The default dupefilter shipped with scrapy
    class RFPDupeFilter(BaseDupeFilter):
        """Request Fingerprint duplicates filter"""
    
        def __init__(self, path=None, debug=False):
            self.file = None
            # A set of request fingerprints; a set holds no duplicates by definition
            self.fingerprints = set()
            self.logdupes = True
            self.debug = debug
            self.logger = logging.getLogger(__name__)
            # If a job directory is configured (JOBDIR), the fingerprint set is also persisted
            # under that directory; see 《scrapy进阶开发(二):暂停与重启》 (pausing and resuming)
            if path:
                self.file = open(os.path.join(path, 'requests.seen'), 'a+')
                self.file.seek(0)
                self.fingerprints.update(x.rstrip() for x in self.file)
    
        @classmethod
        def from_settings(cls, settings):
            # Setting DUPEFILTER_DEBUG to True in settings enables logging of every filtered duplicate
            debug = settings.getbool('DUPEFILTER_DEBUG')
            return cls(job_dir(settings), debug)
    
        def request_seen(self, request):
            # Compute a fingerprint for this request
            fp = self.request_fingerprint(request)
            # Check whether the fingerprint is already in the set
            if fp in self.fingerprints:
                # If so, return True: the request has been seen before and should be filtered out
                return True
            # Otherwise record the new fingerprint
            self.fingerprints.add(fp)
            # If a JOBDIR file is open, also persist the fingerprint to disk
            if self.file:
                self.file.write(fp + os.linesep)
    
        # The request_fingerprint helper lives in scrapy.utils.request;
        # it uses SHA1 to produce a fixed-length hash for every request
        def request_fingerprint(self, request):
            return request_fingerprint(request)
    
        # Close the persistence file, if any
        def close(self, reason):
            if self.file:
                self.file.close()
    
        # Logging helper for filtered duplicates
        def log(self, request, spider):
            if self.debug:
                msg = "Filtered duplicate request: %(request)s"
                self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            elif self.logdupes:
                msg = ("Filtered duplicate request: %(request)s"
                       " - no more duplicates will be shown"
                       " (see DUPEFILTER_DEBUG to show all duplicates)")
                self.logger.debug(msg, {'request': request}, extra={'spider': spider})
                self.logdupes = False
    
            spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
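
    The fingerprint is what makes the comparison robust: request_fingerprint hashes the request method, the canonicalized URL and the body, so two requests that differ only in query-parameter order count as duplicates. A quick check (the example.com URLs are placeholders):

    from scrapy import Request
    from scrapy.utils.request import request_fingerprint
    
    # canonicalize_url sorts the query arguments before the SHA1 is computed,
    # so these two requests deliberately produce the same fingerprint
    r1 = Request('http://example.com/page?a=1&b=2')
    r2 = Request('http://example.com/page?b=2&a=1')
    print(request_fingerprint(r1) == request_fingerprint(r2))  # True
    
    # A different method or body gives a different fingerprint
    r3 = Request('http://example.com/page?a=1&b=2', method='POST', body='x=1')
    print(request_fingerprint(r1) == request_fingerprint(r3))  # False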
    

    To recap, the call chain is ExecutionEngine.schedule → Scheduler.enqueue_request → RFPDupeFilter.request_seen.
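
    To swap in your own dedup logic, point the DUPEFILTER_CLASS setting at a custom class. A minimal sketch, assuming you want to deduplicate on the canonicalized URL alone and ignore method and body; the module path and the name URLDupeFilter are illustrative:

    # myproject/dupefilters.py (hypothetical module)
    from w3lib.url import canonicalize_url
    
    from scrapy.dupefilters import RFPDupeFilter
    
    class URLDupeFilter(RFPDupeFilter):
        """Deduplicate on the canonicalized URL only."""
    
        def request_fingerprint(self, request):
            # request_seen only needs a hashable value it can keep in the set
            # and write to requests.seen, so a canonicalized URL string works
            return canonicalize_url(request.url)

    Then enable it in settings.py, optionally together with duplicate logging:

    # settings.py
    DUPEFILTER_CLASS = 'myproject.dupefilters.URLDupeFilter'
    DUPEFILTER_DEBUG = True  # log every filtered duplicate, not just the first one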
