美文网首页
Scrapy 爬虫爬取unsplash图库

Scrapy 爬虫爬取unsplash图库

作者: Mefisto_fele | 来源:发表于2016-11-02 17:06 被阅读0次

    本文给出了通过 Scrapy 框架爬取 Unsplash 图库的完整代码,依次为:Item 定义(picture/items.py)、爬虫 UnsplashSpider,以及图片下载管道(picture/pipelines.py):

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    import scrapy
    
    class UnsplashItem(scrapy.Item):
        """Record describing a single Unsplash photo gathered by the crawler."""

        # Photographer details.
        author = scrapy.Field()
        author_bio = scrapy.Field()

        # Identifier of the photo.
        image_id = scrapy.Field()

        # List of URLs to download for this photo.
        image_urls = scrapy.Field()

        # Not assigned by any code in this file; presumably populated by
        # Scrapy's images pipeline -- TODO confirm.
        images = scrapy.Field()

        # Storage paths of the images that were downloaded successfully.
        image_paths = scrapy.Field()
    
    # -*- coding: utf-8 -*-
    import scrapy
    import json
    import urllib
    from picture.items import UnsplashItem
    
    class UnsplashSpider(scrapy.Spider):
        """Crawl the Unsplash home feed and emit one ``UnsplashItem`` per photo.

        Every feed page is a JSON document. ``parse`` reads its ``photos`` list
        and then follows ``next_page`` until the feed no longer provides one.
        """

        name = "unsplash"
        allowed_domains = ["unsplash.com"]

        # Root under which follow-up feed pages are requested.
        NAPI_ROOT = 'https://unsplash.com/napi/'
        # Number of leading characters dropped from the feed's ``next_page``
        # URL before it is re-rooted under NAPI_ROOT. 25 equals
        # len('https://api.unsplash.com/'); presumably that is the prefix the
        # feed returns -- TODO confirm against a live response.
        NEXT_PAGE_PREFIX_LEN = 25

        custom_settings = {
            # Only these two headers are sent explicitly; the other browser
            # headers that had been listed here were all disabled.
            'DEFAULT_REQUEST_HEADERS': {
                'Upgrade-Insecure-Requests': '1',
                'x-unsplash-client': 'web',
            },
            "ITEM_PIPELINES": {
                'picture.pipelines.UnsplashImagesPipeline': 300,
            },
            "IMAGES_STORE": './images',
            "LOG_FILE": 'unsplash.log',
        }

        start_urls = (
            'https://unsplash.com/napi/feeds/home',
        )

        def parse(self, response):
            """Yield an item for each photo on the page, then request the next page.

            :param response: feed response whose body is a JSON object with a
                ``photos`` list and, while more pages exist, a ``next_page`` URL.
            :returns: generator of ``UnsplashItem`` objects followed by at most
                one ``scrapy.Request`` for the following page.
            """
            # Decode the body once and reuse it for both photos and pagination.
            feed = json.loads(response.body)

            for photo in feed['photos']:
                item = UnsplashItem()
                item['author'] = photo['user']['name']
                item['author_bio'] = photo['user']['bio']
                item['image_id'] = photo['id']
                item['image_urls'] = [photo['urls']['full']]
                yield item

            # Test the raw value *before* building the URL: a missing or null
            # ``next_page`` marks the last page and must end the crawl instead
            # of raising while slicing.
            next_page = feed.get('next_page')
            if not next_page:
                return

            yield scrapy.Request(
                self.NAPI_ROOT + next_page[self.NEXT_PAGE_PREFIX_LEN:],
                callback=self.parse,
            )
    
    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    import scrapy
    from scrapy.pipelines.images import ImagesPipeline
    from scrapy.exceptions import DropItem
    
    class UnsplashImagesPipeline(ImagesPipeline):
        """Images pipeline that stores each photo as ``full/<author>/<image_id>.jpg``."""

        # Characters that must never end up inside a single path component.
        _UNSAFE_CHARS = '\\/:*?"<>|'

        def get_media_requests(self, item, info):
            """Yield one download request per URL in ``item['image_urls']``.

            The item is attached to ``request.meta`` so that ``file_path`` can
            build the target path from its fields.
            """
            for image_url in item['image_urls']:
                yield scrapy.Request(image_url, meta={'item': item})

        def item_completed(self, results, item, info):
            """Record the stored paths on the item, dropping it if nothing was saved.

            :param results: iterable of ``(ok, info_dict)`` pairs, one per request.
            :raises DropItem: when no download succeeded.
            :returns: the item with ``image_paths`` filled in.
            """
            image_paths = [x['path'] for ok, x in results if ok]
            if not image_paths:
                raise DropItem("Item contains no images")
            item['image_paths'] = image_paths
            return item

        def file_path(self, request, response=None, info=None, item=None):
            """Return the storage path, relative to the images store, for a request.

            :param request: download request; its ``meta['item']`` is used when
                ``item`` is not supplied.
            :param item: the item being processed, when the caller passes it.
                Accepted as an optional argument so the override works both
                with callers that pass it and with callers that do not.
            :returns: ``'full/<author>/<image_id>.jpg'`` with both components
                made safe for use as a single path segment.
            """
            if item is None:
                item = request.meta['item']
            # Author names come from the remote feed: without sanitising, a name
            # containing a path separator or consisting of '..' would create
            # unexpected directories or escape the 'full/' folder.
            author = self._safe_component(item['author'])
            image_id = self._safe_component(item['image_id'])
            return 'full/{0}/{1}.jpg'.format(author, image_id)

        @classmethod
        def _safe_component(cls, value):
            """Turn ``value`` into text usable as exactly one path component."""
            text = '' if value is None else '{0}'.format(value)
            cleaned = ''.join(
                '_' if (ch in cls._UNSAFE_CHARS or ord(ch) < 32) else ch
                for ch in text
            )
            # Empty, '.', '..' and similar dot/space-only names are not usable
            # directory or file names; substitute a fixed placeholder.
            if not cleaned.strip('. '):
                return 'unknown'
            return cleaned
    

    相关文章

      网友评论

          本文标题:Scrapy 爬虫爬取unsplash图库

          本文链接:https://www.haomeiwen.com/subject/xgnvuttx.html