Scraping Desktop Wallpapers with the Scrapy Framework

Author: 轻松学Python111 | Published 2019-10-12 20:28

    Target data: ZOL desktop wallpapers, specifically the images in every album across the 19 pages of the [风景 (landscape)] [1920*1080] category.

    items.py

    import scrapy
    
    class Zol2Item(scrapy.Item):
        # URL of the full-size wallpaper; consumed by the images pipeline
        image_urls = scrapy.Field()
        # populated by ImagesPipeline with the download results
        images = scrapy.Field()
        # running number used to build the saved file's name
        image_title = scrapy.Field()
    
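    Note: image_urls and images are the two field names the stock ImagesPipeline looks for, while image_title is a custom field used to name the saved files. The stock pipeline expects image_urls to be a list of URLs; this project stores a single URL string instead and compensates by overriding get_media_requests() in the pipeline below. A quick sketch of how the spider fills the item (the URL is illustrative only):

    item = Zol2Item()
    item['image_urls'] = 'https://example.com/wallpaper.jpg'  # hypothetical URL
    item['image_title'] = '1'
    # item['images'] is populated by the pipeline once the download completes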

    pipelines.py

    from scrapy import Request
    from scrapy.pipelines.images import ImagesPipeline
    
    # the class name must match the ITEM_PIPELINES entry in settings.py
    class Zol2Pipeline(ImagesPipeline):
        def get_media_requests(self, item, info):
            # image_urls holds a single URL string here (the spider uses
            # extract_first()), not the list the stock ImagesPipeline expects
            image_url = item["image_urls"]
            if image_url:
                # pass the item along so file_path() can read image_title
                yield Request(url=image_url, meta={"item": item})
    
        def file_path(self, request, response=None, info=None):
            # save each image as desk/<image_title>.jpg under IMAGES_STORE
            return 'desk/{}.jpg'.format(request.meta["item"]["image_title"])
    
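    To also discard items whose image failed to download, the item_completed() hook of ImagesPipeline can be overridden as well. A minimal sketch (not part of the original post) that could be added to Zol2Pipeline:

    from scrapy.exceptions import DropItem

    # inside Zol2Pipeline: drop items whose download failed
    def item_completed(self, results, item, info):
        # results is a list of (success, result) tuples, one per media request
        image_paths = [res['path'] for ok, res in results if ok]
        if not image_paths:
            raise DropItem("image download failed for %s" % item["image_title"])
        return item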

    middlewares.py

    from scrapy import signals
    # agents is assumed to be a list of User-Agent strings; the template
    # classes below never use it (see the random User-Agent sketch after
    # this file)
    from zol2.useragents import agents
    
    class Zol2SpiderMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the spider middleware does not modify the
        # passed objects.
    
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s
    
        def process_spider_input(self, response, spider):
            # Called for each response that goes through the spider
            # middleware and into the spider.
    
            # Should return None or raise an exception.
            return None
    
        def process_spider_output(self, response, result, spider):
            # Called with the results returned from the Spider, after
            # it has processed the response.
    
            # Must return an iterable of Request, dict or Item objects.
            for i in result:
                yield i
    
        def process_spider_exception(self, response, exception, spider):
            # Called when a spider or process_spider_input() method
            # (from other spider middleware) raises an exception.
    
            # Should return either None or an iterable of Response, dict
            # or Item objects.
            pass
    
        def process_start_requests(self, start_requests, spider):
            # Called with the start requests of the spider, and works
            # similarly to the process_spider_output() method, except
            # that it doesn’t have a response associated.
    
            # Must return only requests (not items).
            for r in start_requests:
                yield r
    
        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)
    
    class Zol2DownloaderMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the downloader middleware does not modify the
        # passed objects.
    
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s
    
        def process_request(self, request, spider):
            # Called for each request that goes through the downloader
            # middleware.
    
            # Must either:
            # - return None: continue processing this request
            # - or return a Response object
            # - or return a Request object
            # - or raise IgnoreRequest: process_exception() methods of
            #   installed downloader middleware will be called
            return None
    
        def process_response(self, request, response, spider):
            # Called with the response returned from the downloader.
    
        # Must either:
            # - return a Response object
            # - return a Request object
            # - or raise IgnoreRequest
            return response
    
        def process_exception(self, request, exception, spider):
            # Called when a download handler or a process_request()
            # (from other downloader middleware) raises an exception.
    
            # Must either:
            # - return None: continue processing this exception
            # - return a Response object: stops process_exception() chain
            # - return a Request object: stops process_exception() chain
            pass
    
        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)
    
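    The agents import at the top of this file is never used by the two template classes above; a random User-Agent downloader middleware is presumably what it was intended for. A minimal sketch, assuming zol2/useragents.py defines agents as a list of User-Agent strings (that module is not shown in the post):

    import random
    from zol2.useragents import agents  # assumed: a list of User-Agent strings

    class RandomUserAgentMiddleware(object):
        """Set a randomly chosen User-Agent on every outgoing request."""

        def process_request(self, request, spider):
            request.headers['User-Agent'] = random.choice(agents)
            return None  # continue processing the request normally

    To take effect it would also need an entry in DOWNLOADER_MIDDLEWARES; see the note after settings.py below.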

    settings.py

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for zol2 project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://doc.scrapy.org/en/latest/topics/settings.html
    #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'zol2'
    
    SPIDER_MODULES = ['zol2.spiders']
    NEWSPIDER_MODULE = 'zol2.spiders'
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
    
    # Obey robots.txt rules (left commented out here; Scrapy's default for
    # ROBOTSTXT_OBEY is False, so robots.txt is not enforced for this crawl)
    # ROBOTSTXT_OBEY = True
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    DOWNLOAD_DELAY = 0.5
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'zol2.middlewares.Zol2SpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'zol2.middlewares.Zol2DownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
       'zol2.pipelines.Zol2Pipeline': 300,
    }
    IMAGES_STORE = "/home/pyvip/env_spider/zol2/zol2/images"
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    
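    Two details worth noting: the ITEM_PIPELINES entry must match the pipeline's class name (Zol2Pipeline), and IMAGES_STORE is the root that the 'desk/...' paths returned by file_path() are joined to, so the images land in .../images/desk/. If the random User-Agent middleware sketched above were used, enabling it would look like this (hypothetical entry; 543 is the template's default priority, and the built-in UserAgentMiddleware is disabled so it cannot overwrite the header):

    DOWNLOADER_MIDDLEWARES = {
        'zol2.middlewares.RandomUserAgentMiddleware': 543,
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    }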

    pazol2.py

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from zol2.items import Zol2Item
    
    class Pazol2Spider(CrawlSpider):
        name = 'pazol2'
        # allowed_domains = ['desk.zol.com.cn']
        start_urls = ['http://desk.zol.com.cn/fengjing/1920x1080/']
        front_url = "http://desk.zol.com.cn"
        num = 1
    
        rules = (
            # 1. Pagination: follow all 19 listing pages of the category
            #    (no callback needed; this rule only discovers album links)
            Rule(LinkExtractor(allow=r'/fengjing/1920x1080/[0-1]?[0-9]?\.html'), follow=True),
            # 2. Enter the page of each individual picture in every album
            Rule(LinkExtractor(allow=r'/bizhi/\d+_\d+_\d+\.html', restrict_xpaths=("//div[@class='main']/ul[@class='pic-list2  clearfix']/li", "//div[@class='photo-list-box']")), follow=True),
            # 3. Follow each picture's 1920x1080 button to the page holding the full-size image
            Rule(LinkExtractor(allow=r'/showpic/1920x1080_\d+_\d+\.html'), callback='get_img', follow=True),
        )
    
        def get_img(self, response):
            item = Zol2Item()
            # the full-size wallpaper is the first <img> directly under <body>
            item['image_urls'] = response.xpath("//body/img[1]/@src").extract_first()
            # name files with a running counter (crawl order is not deterministic)
            item['image_title'] = str(self.num)
            self.num += 1
            yield item
    
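    With everything in place, the crawl is started from the project root with "scrapy crawl pazol2". It can also be driven from a plain script via Scrapy's standard API:

    # run_spider.py: launch the crawl programmatically
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl('pazol2')  # the spider's name attribute
    process.start()          # blocks until the crawl finishes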

    Results


    In total, 4,517 images were scraped, taking 108 minutes.

    Point the desktop wallpaper slideshow at the folder, rotate every half hour, and enjoy.
