Crawler Series (25): Scraping Images with Scrapy

Author: 文子轩 | Published 2018-01-31 17:33

    items.py

        import scrapy


        class CoserItem(scrapy.Item):
            url = scrapy.Field()
            name = scrapy.Field()
            info = scrapy.Field()
            # image_urls / images are the field names Scrapy's built-in
            # ImagesPipeline expects, so this item also works with it unchanged
            image_urls = scrapy.Field()
            images = scrapy.Field()
    

    spiders/coser.py

        # -*- coding: utf-8 -*-
        import scrapy
        from scrapy.selector import Selector
        from scrapy.loader import ItemLoader  # scrapy.contrib.loader before Scrapy 1.0
        from Cosplay.items import CoserItem
    
    
        class CoserSpider(scrapy.Spider):
            name = "coser"
            allowed_domains = ["bcy.net"]
            start_urls = (
                'http://bcy.net/cn125101',
                'http://bcy.net/cn126487',
                'http://bcy.net/cn126173'
            )
    
            def parse(self, response):
                sel = Selector(response)

                # Each profile page lists the user's works; follow every
                # work's detail-page link
                for link in sel.xpath("//ul[@class='js-articles l-works']/li[@class='l-work--big']/article[@class='work work--second-created']/h2[@class='work__title']/a/@href").extract():
                    link = 'http://bcy.net%s' % link  # hrefs are site-relative
                    yield scrapy.Request(link, callback=self.parse_item)
    
            def parse_item(self, response):
                l = ItemLoader(item=CoserItem(), response=response)
                l.add_xpath('name', "//h1[@class='js-post-title']/text()")
                l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
                urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
                # Dropping the '/w650' suffix yields the full-size image URL
                urls = [url.replace('/w650', '') for url in urls]
                l.add_value('image_urls', urls)
                l.add_value('url', response.url)

                return l.load_item()
    
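    These XPath expressions are tied to bcy.net's 2018 page layout and may no longer match. A quick way to check them is scrapy shell; below is a hypothetical session with a shortened XPath, assuming the layout is unchanged:

        scrapy shell 'http://bcy.net/cn125101'
        >>> response.xpath("//h2[@class='work__title']/a/@href").extract()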

    pipelines.py

        import requests
        from Cosplay import settings
        import os
    
    
        class ImageDownloadPipeline(object):
            def process_item(self, item, spider):
                if 'image_urls' in item:
                    images = []
                    # One sub-directory per spider under IMAGES_STORE
                    dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)

                    if not os.path.exists(dir_path):
                        os.makedirs(dir_path)
                    for image_url in item['image_urls']:
                        # Turn the URL path into a flat file name, e.g.
                        # http://host/a/b/c.jpg -> a_b_c.jpg
                        us = image_url.split('/')[3:]
                        image_file_name = '_'.join(us)
                        file_path = '%s/%s' % (dir_path, image_file_name)
                        images.append(file_path)
                        if os.path.exists(file_path):
                            continue  # already downloaded

                        # Stream the image to disk in 1 KB chunks
                        with open(file_path, 'wb') as handle:
                            response = requests.get(image_url, stream=True)
                            for block in response.iter_content(1024):
                                if not block:
                                    break

                                handle.write(block)

                    item['images'] = images
                return item
    
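    Note that requests.get() here runs synchronously inside process_item, so every image download blocks the whole crawl (a commenter points this out below). A minimal sketch of the non-blocking alternative is Scrapy's built-in ImagesPipeline, which fetches images through the framework's asynchronous downloader; it reads the item's image_urls field and fills images, so the item defined above works unchanged (Pillow must be installed):

        # settings.py -- swap the custom pipeline for the built-in one
        ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
        IMAGES_STORE = '../Images'   # files land under IMAGES_STORE/full/<sha1>.jpg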

    settings.py

        ITEM_PIPELINES = {'Cosplay.pipelines.ImageDownloadPipeline': 1}
    
        IMAGES_STORE = '../Images'
    
        DOWNLOAD_DELAY = 0.25    # 250 ms of delay
    
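    One assumption worth checking if you run this under a newer Scrapy: projects generated by scrapy startproject enable robots.txt checking, which can filter these requests with a "Forbidden by robots.txt" log line. Whether to disable it is a per-project judgment call:

        ROBOTSTXT_OBEY = False   # newer project templates default this to True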


    Create a main.py file in the project root so the spider can be run and debugged from an IDE:

    # Launch the spider programmatically instead of via the scrapy CLI
    from scrapy import cmdline
    cmdline.execute('scrapy crawl coser'.split())
    
    

    Run the program

    python2 main.py
    
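    Equivalently, the spider can be started from the command line in the project root, without main.py:

        scrapy crawl coser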

      Reader comments

      • 水哥哥1991: I think the way this pipeline saves images is blocking, so the crawl is bound to be slow.
        文子轩: @水哥哥1991 True, but you can add more list-page links to the spider's start_urls and rotate more proxy IPs to speed it up.
