Downloader middlewares, resumable crawling, and settings-file parameters

Author: 咻咻咻滴赵大妞 | Published 2019-01-07 20:17

    The downloader middlewares below handle:

    User-Agent
    Cookies
    IP
    Selenium

    1. User-Agent

    Add a few UA strings in settings.py:

    USERAGENT = [
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    ]
    

    Set up the User-Agent middleware in middlewares.py.

    There are several ways to read values from settings.py; three methods follow.
    Method 1:

    class UserAgentDownloadMiddlerware(object):
        def __init__(self, User_Agents):
            self.User_Agents = User_Agents

        @classmethod
        def from_crawler(cls, crawler):
            # read the UA list from settings.py via the crawler object
            User_Agents = crawler.settings['USERAGENT']
            return cls(User_Agents)

        def process_request(self, request, spider):
            """
            Every request passes through this method before it is handed to the downloader.
            :param request:
            :param spider:
            :return:
            """
            import random
            random_ua = random.choice(self.User_Agents)
            if random_ua:
                request.headers['User-Agent'] = random_ua
    

    Method 2:

    class UserAgentDownloadMiddlerware(object):
        def process_request(self, request, spider):
            import random
            # the spider also exposes the settings object
            User_Agent = spider.settings['USERAGENT']
            random_ua = random.choice(User_Agent)
            if random_ua:
                request.headers['User-Agent'] = random_ua
    
    

    Method 3 (uses the third-party fake_useragent package, installed with pip install fake-useragent):

    class UserAgentDownloadMiddlerware(object):
        def process_request(self, request, spider):
            from fake_useragent import UserAgent
            useAgent = UserAgent()
            random_ua = useAgent.random
            if random_ua:
                print('passed through the downloader middleware', random_ua)
                request.headers['User-Agent'] = random_ua
    

    2. IP proxy middleware

    Mock a proxy pool in settings.py:

    PROXIES = [
        {'ip': '127.0.0.1:6379', 'pwd': 'zwz:1234'},  # with username/password
        {'ip': '127.0.0.1:6372', 'pwd': None},        # without username/password
        {'ip': '127.0.0.1:6373', 'pwd': None},
        {'ip': '127.0.0.1:6370', 'pwd': None}
    ]
    

    Set up the proxy middleware in middlewares.py:

    class ProxyDownloadMiddlerware(object):
        def process_request(self, request, spider):
            proxies = spider.settings['PROXIES']
            import random
            proxy_rm = random.choice(proxies)

            if proxy_rm['pwd']:
                # proxy that requires a username/password
                # base64-encode the credentials
                import base64
                base64_pwd = base64.b64encode(proxy_rm['pwd'].encode('utf-8')).decode('utf-8')
                # put them into the Proxy-Authorization header expected by the proxy server
                request.headers['Proxy-Authorization'] = 'Basic ' + base64_pwd
                # set the proxy (Scrapy expects a full URL, including the scheme)
                request.meta['proxy'] = 'http://' + proxy_rm['ip']
            else:
                # set the proxy
                request.meta['proxy'] = 'http://' + proxy_rm['ip']
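
    A quick way to check that the proxy is actually being applied is a tiny throwaway spider (hypothetical, not part of the original project) that requests httpbin.org/ip, which echoes back the IP the request came from:

    import scrapy

    class IpCheckSpider(scrapy.Spider):
        # hypothetical helper spider, used only to verify the proxy middleware
        name = 'ipcheck'
        start_urls = ['http://httpbin.org/ip']

        def parse(self, response):
            # if the proxy middleware works, this prints the proxy's IP, not yours
            print(response.text)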
    
    

    3. Cookie middleware

    Mock a cookie pool in settings.py:

    COOKIES =[
        {'cookie1':'xxxx'},
        {'cookie1':'xxxx'},
        {'cookie1':'xxxx'},
        {'cookie1':'xxxx'},
        {'cookie1':'xxxx'},
    ]
    

    Set up the cookie middleware in middlewares.py:

    import random

    class RandomCookiesMiddleware(object):

        def process_request(self, request, spider):
            cookies = spider.settings['COOKIES']
            # pick one cookie dict at random
            cookie = random.choice(cookies)
            if cookie:
                request.cookies = cookie
    

    4. Using Selenium to fetch dynamically rendered pages

    Add this in the spider of your project. Some pages are rendered dynamically and some are static, so the driver goes into the spider that needs to crawl dynamic pages.

    import scrapy
    from selenium import webdriver

    class TestSpider(scrapy.Spider):
        name = 'test'
        allowed_domains = ['baidu.com']
        start_urls = ['http://www.baidu.com/']

        # create the browser driver
        driver = webdriver.Firefox(
            executable_path='/home/zwz/Desktop/浏览器驱动/geckodriver/'
        )
        driver.set_page_load_timeout(10)

        def parse(self, response):
            print(response.status, response.request.headers)
    

    Set up the Selenium middleware in middlewares.py. Returning an HtmlResponse from process_request short-circuits the download: Scrapy hands that response straight to the spider instead of downloading the URL itself.

    # Scrapy itself does not render JavaScript, so dynamically loaded pages are fetched with Selenium
    from scrapy import signals
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from scrapy.http import HtmlResponse

    class SeleniumDownloadMiddlerWare(object):
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            # listen for the spider_closed signal so the browser can be shut down
            crawler.signals.connect(s.close, signal=signals.spider_closed)
            return s

        def close(self, spider):
            import time
            time.sleep(5)
            spider.driver.close()

        def process_request(self, request, spider):
            if spider.name == 'test':
                # get the url
                url = request.url

                if url:
                    try:
                        spider.driver.get(url)
                        pageSource = spider.driver.page_source

                        if pageSource:
                            """
                            HtmlResponse(url, status=200, headers=None,
                                         body=b'', flags=None, request=None)
                            """
                            return HtmlResponse(
                                url=url,
                                status=200,
                                body=pageSource.encode('utf-8'),
                                request=request
                            )

                    except TimeoutException as err:
                        print('request timed out', url)
                        return HtmlResponse(
                            url=url,
                            status=408,
                            body=b'',
                            request=request
                        )
    
    
    

    Finally, don't forget to register and activate the downloader middlewares.

    In settings.py:

    DOWNLOADER_MIDDLEWARES = {
       # 'downloadmiddlerware.middlewares.DownloadmiddlerwareDownloaderMiddleware': 543,
        'downloadmiddlerware.middlewares.UserAgentDownloadMiddlerware':543,
        'downloadmiddlerware.middlewares.ProxyDownloadMiddlerware':544,
        'downloadmiddlerware.middlewares.RandomCookiesMiddleware':545,
        'downloadmiddlerware.middlewares.SeleniumDownloadMiddlerWare':546,
    }
    

    Resuming an interrupted crawl (checkpointing):

    scrapy crawl <spider name> -s JOBDIR=crawls/<spider name>
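
    For example, for the test spider defined earlier:

    scrapy crawl test -s JOBDIR=crawls/test

    Stopping the crawl gracefully (a single Ctrl-C) and running the same command again resumes from where it left off. The JOBDIR directory stores: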

    requests.queue : the queue of pending requests
    requests.seen : the fingerprints of requests already seen (for deduplication)
    spider.state : the spider's persisted state (a sketch of using it follows below)
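
    A minimal sketch (reusing the test spider from this article) of keeping data in spider.state so it survives a pause/resume cycle; when JOBDIR is set, Scrapy persists this dict to the spider.state file between runs:

    import scrapy

    class TestSpider(scrapy.Spider):
        name = 'test'
        start_urls = ['http://www.baidu.com/']

        def parse(self, response):
            # self.state is an ordinary dict that Scrapy saves to JOBDIR/spider.state
            # when the crawl is paused and restores when it is resumed
            self.state['pages_seen'] = self.state.get('pages_seen', 0) + 1
            self.logger.info('pages seen so far: %s', self.state['pages_seen'])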
    

    Scrapy settings.py (relevant parameters)

    # project name
    BOT_NAME = 'downloadmiddlerware'
    
    # module(s) where the spiders are stored
    SPIDER_MODULES = ['downloadmiddlerware.spiders']
    # module where newly created spider files are placed
    NEWSPIDER_MODULE = 'downloadmiddlerware.spiders'
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    # set a UA to mimic a browser request
    #USER_AGENT = 'downloadmiddlerware (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    # whether to obey robots.txt (default: True)
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    # maximum number of concurrent requests handled by the downloader (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    # download delay between requests (default: 0)
    DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    # maximum concurrent requests per domain (default: 8)
    CONCURRENT_REQUESTS_PER_DOMAIN = 16
    
    # maximum concurrent requests per IP (default: 0)
    # If non-zero:
    # 1. CONCURRENT_REQUESTS_PER_DOMAIN is ignored,
    #    and concurrency is limited per IP instead of per domain
    # 2. DOWNLOAD_DELAY is also applied per IP rather than per domain
    CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    # whether cookie handling is enabled (default: True); the cookie-pool middleware above needs this to stay enabled
    COOKIES_ENABLED = False
    
    # COOKIES_DEBUG: log all cookies sent and received (default: False)
    COOKIES_DEBUG =True
    
    # logging settings
    LOG_FILE = 'xxx.log'
    LOG_LEVEL = 'INFO'  # or DEBUG, WARNING, ERROR, CRITICAL
    
    # Disable Telnet Console (enabled by default)
    # a telnet-based console extension for inspecting a running crawler
    TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    # default request headers (do not put cookies here)
    DEFAULT_REQUEST_HEADERS = {
      # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      # 'Accept-Language': 'en',
        'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    # configure and enable spider middlewares
    #SPIDER_MIDDLEWARES = {
    #    'downloadmiddlerware.middlewares.DownloadmiddlerwareSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    # configure and enable downloader middlewares (the number is the priority/order)
    DOWNLOADER_MIDDLEWARES = {
       # 'downloadmiddlerware.middlewares.DownloadmiddlerwareDownloaderMiddleware': 543,
       #  'downloadmiddlerware.middlewares.UserAgentDownloadMiddlerware':543,
        'downloadmiddlerware.middlewares.SeleniumDownloadMiddlerWare':543,
    }
    
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    # configure extensions
    EXTENSIONS = {
       'scrapy.extensions.telnet.TelnetConsole': None,
    }
    
    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    # configure and enable item pipelines (the number is the priority/order)
    ITEM_PIPELINES = {
        'downloadmiddlerware.pipelines.DownloadmiddlerwarePipeline': 300,
    }
    
    # AutoThrottle extension: makes the delay between consecutive requests variable rather than fixed
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    AUTOTHROTTLE_ENABLED = True
    # The initial download delay (default: 5 seconds)
    AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received (default: False)
    AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    # HTTP response caching (disabled by default)
    HTTPCACHE_ENABLED = True
    # cache expiry time in seconds (default 0 = never expire)
    HTTPCACHE_EXPIRATION_SECS = 0
    # directory where the cache is stored
    HTTPCACHE_DIR = 'httpcache'
    # do not cache responses with these HTTP status codes
    HTTPCACHE_IGNORE_HTTP_CODES = []
    # storage backend used for the cache
    HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    
    
