美文网首页
6-2 middlewares.py

6-2 middlewares.py

作者: 学飞的小鸡 | 来源:发表于2018-10-31 21:06 被阅读0次
    # -*- coding: utf-8 -*-
    
    # Define here the downloader middleware for your project
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    from scrapy import signals
    from selenium import webdriver
    from time import sleep
    
    from scrapy.http import HtmlResponse
    
    class ToutiaoDownloaderMiddleware(object):
        """Downloader middleware that renders every request in headless Chrome.

        Instead of letting Scrapy download the page, ``process_request`` opens
        the URL in a Selenium-driven browser, scrolls down step by step so that
        content loaded on scroll has a chance to appear, and hands the rendered
        page source back to Scrapy as an ``HtmlResponse``.

        Not all methods need to be defined. If a method is not defined,
        Scrapy acts as if the downloader middleware does not modify the
        passed objects.
        """

        # Seconds to wait after the initial page load before scrolling starts.
        PAGE_LOAD_WAIT = 3
        # Number of scroll steps performed on each page.
        SCROLL_STEPS = 100
        # Pixels added to the scroll offset on every step.
        SCROLL_STEP_PX = 100
        # Seconds to pause after each scroll step.
        SCROLL_PAUSE = 0.5
        # JavaScript template that sets the vertical scroll offset (in pixels).
        SCROLL_JS = "var q = document.documentElement.scrollTop=%d"

        @classmethod
        def from_crawler(cls, crawler):
            """Create the middleware instance and hook up ``spider_opened``.

            Args:
                crawler: The crawler object whose signals are connected to.

            Returns:
                The new middleware instance.
            """
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s

        def _create_driver(self):
            """Return a new headless Chrome webdriver instance."""
            opt = webdriver.ChromeOptions()
            opt.add_argument("--headless")
            return webdriver.Chrome(options=opt)

        def process_request(self, request, spider):
            """Render ``request.url`` in a browser and return the resulting page.

            Called for each request that goes through the downloader
            middleware. Returning a response object here means the rendered
            page is used as the download result for this request.

            Args:
                request: The request being processed; only ``request.url`` is
                    read, and the request is attached to the returned response.
                spider: The spider the request belongs to (unused).

            Returns:
                An ``HtmlResponse`` built from the browser's final URL and page
                source.
            """
            driver = self._create_driver()
            try:
                driver.get(request.url)
                sleep(self.PAGE_LOAD_WAIT)
                # Scroll the page down one step at a time, pausing after each
                # step so that content loaded on scroll can be added.
                for step in range(1, self.SCROLL_STEPS + 1):
                    driver.execute_script(self.SCROLL_JS % (step * self.SCROLL_STEP_PX))
                    sleep(self.SCROLL_PAUSE)
                body = driver.page_source
                current_url = driver.current_url
            finally:
                # Always shut the browser down; without this every request
                # would leave a Chrome/chromedriver process running.
                driver.quit()

            print("正在使用中间件下载...")
            print("当前浏览器正在访问的网址是:", current_url)
            # The response has to be rebuilt from what the browser rendered.
            return HtmlResponse(url=current_url, body=body, encoding='utf-8', request=request)

        def process_response(self, request, response, spider):
            """Return the response unchanged.

            Called with the response returned from the downloader. Must either
            return a Response object, return a Request object, or raise
            IgnoreRequest.
            """
            return response

        def process_exception(self, request, exception, spider):
            """Return ``None`` so that exception processing continues.

            Called when a download handler or a ``process_request()`` (from
            other downloader middleware) raises an exception. Returning a
            Response or Request object instead would stop the
            ``process_exception()`` chain.
            """
            return None

        def spider_opened(self, spider):
            """Log the name of the spider that was just opened."""
            spider.logger.info('Spider opened: %s' % spider.name)
    
    

    相关文章

      网友评论

          本文标题:6-2 middlewares.py

          本文链接:https://www.haomeiwen.com/subject/fqeztqtx.html