Crawler Series (25): Scraping Images with Scrapy

Author: 文子轩 | Published 2018-01-31 17:33

    items.py

        import scrapy


        class CoserItem(scrapy.Item):
            url = scrapy.Field()
            name = scrapy.Field()
            info = scrapy.Field()
            # image_urls / images are the field names Scrapy's built-in
            # ImagesPipeline expects, so this item also works with it unchanged
            image_urls = scrapy.Field()
            images = scrapy.Field()
    

    spiders/coser.py

        # -*- coding: utf-8 -*-
        import scrapy
        from scrapy.selector import Selector
        from scrapy.loader import ItemLoader  # scrapy.contrib.loader before Scrapy 1.0
        from Cosplay.items import CoserItem
    
    
        class CoserSpider(scrapy.Spider):
            name = "coser"
            allowed_domains = ["bcy.net"]
            start_urls = (
                'http://bcy.net/cn125101',
                'http://bcy.net/cn126487',
                'http://bcy.net/cn126173'
            )
    
            def parse(self, response):
                sel = Selector(response)

                # Each profile page lists the user's works; follow every
                # work's detail-page link
                for link in sel.xpath("//ul[@class='js-articles l-works']/li[@class='l-work--big']/article[@class='work work--second-created']/h2[@class='work__title']/a/@href").extract():
                    link = 'http://bcy.net%s' % link  # hrefs are site-relative
                    yield scrapy.Request(link, callback=self.parse_item)
    
            def parse_item(self, response):
                l = ItemLoader(item=CoserItem(), response=response)
                l.add_xpath('name', "//h1[@class='js-post-title']/text()")
                l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
                urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
                # Dropping the '/w650' suffix yields the full-size image URL
                urls = [url.replace('/w650', '') for url in urls]
                l.add_value('image_urls', urls)
                l.add_value('url', response.url)

                return l.load_item()
    
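    These XPath expressions are tied to bcy.net's 2018 page layout and may no longer match. A quick way to check them is scrapy shell; below is a hypothetical session with a shortened XPath, assuming the layout is unchanged:

        scrapy shell 'http://bcy.net/cn125101'
        >>> response.xpath("//h2[@class='work__title']/a/@href").extract()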

    pipelines.py

        import requests
        from Cosplay import settings
        import os
    
    
        class ImageDownloadPipeline(object):
            def process_item(self, item, spider):
                if 'image_urls' in item:
                    images = []
                    # One sub-directory per spider under IMAGES_STORE
                    dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)

                    if not os.path.exists(dir_path):
                        os.makedirs(dir_path)
                    for image_url in item['image_urls']:
                        # Turn the URL path into a flat file name, e.g.
                        # http://host/a/b/c.jpg -> a_b_c.jpg
                        us = image_url.split('/')[3:]
                        image_file_name = '_'.join(us)
                        file_path = '%s/%s' % (dir_path, image_file_name)
                        images.append(file_path)
                        if os.path.exists(file_path):
                            continue  # already downloaded

                        # Stream the image to disk in 1 KB chunks
                        with open(file_path, 'wb') as handle:
                            response = requests.get(image_url, stream=True)
                            for block in response.iter_content(1024):
                                if not block:
                                    break

                                handle.write(block)

                    item['images'] = images
                return item
    
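    Note that requests.get() here runs synchronously inside process_item, so every image download blocks the whole crawl (a commenter points this out below). A minimal sketch of the non-blocking alternative is Scrapy's built-in ImagesPipeline, which fetches images through the framework's asynchronous downloader; it reads the item's image_urls field and fills images, so the item defined above works unchanged (Pillow must be installed):

        # settings.py -- swap the custom pipeline for the built-in one
        ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
        IMAGES_STORE = '../Images'   # files land under IMAGES_STORE/full/<sha1>.jpg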

    settings.py

        ITEM_PIPELINES = {'Cosplay.pipelines.ImageDownloadPipeline': 1}
    
        IMAGES_STORE = '../Images'
    
        DOWNLOAD_DELAY = 0.25    # 250 ms of delay
    
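    One assumption worth checking if you run this under a newer Scrapy: projects generated by scrapy startproject enable robots.txt checking, which can filter these requests with a "Forbidden by robots.txt" log line. Whether to disable it is a per-project judgment call:

        ROBOTSTXT_OBEY = False   # newer project templates default this to True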


    Create a main.py file in the project root so the spider can be run and debugged from an IDE:

    # Launch the spider programmatically instead of via the scrapy CLI
    from scrapy import cmdline
    cmdline.execute('scrapy crawl coser'.split())
    
    

    Run the program

    python2 main.py
    
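    Equivalently, the spider can be started from the command line in the project root, without main.py:

        scrapy crawl coser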

      Reader comments

      • 水哥哥1991: I think the way this pipeline saves images is blocking, so the crawl is bound to be slow.
        文子轩: @水哥哥1991 True, but you can add more list-page links to the spider's start_urls and rotate more proxy IPs to speed it up.
