美文网首页
Pipeline常用写法

Pipeline常用写法

作者: ckawyh | 来源:发表于2018-07-26 17:42 被阅读0次
    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
    import json
    import codecs
    import pymongo
    from datetime import datetime
    
    
    class JsonWriterPipeline(object):
        """Write each scraped item as one JSON line into a date-stamped file.

        The target directory comes from the ``OUTPUT_DIR`` setting; one file
        is created per calendar day (e.g. ``<OUTPUT_DIR>2018-07-26.json``).
        """

        def __init__(self, ouput_path):
            # NOTE(review): the parameter name keeps the original (misspelled)
            # ``ouput_path`` so existing callers are not broken.
            self.ouput_path = ouput_path

        @classmethod
        def from_crawler(cls, crawler):
            """Standard Scrapy hook: build the pipeline from crawler settings."""
            return cls(
                ouput_path=crawler.settings.get('OUTPUT_DIR'),
            )

        def open_spider(self, spider):
            # OUTPUT_DIR is expected to end with a path separator, so plain
            # string concatenation yields a valid path (see settings.py).
            file_path = self.ouput_path + datetime.strftime(datetime.now(), "%Y-%m-%d") + '.json'
            # Fix: the original used mode 'wb+', which mixes a binary update
            # mode with a text encoding; plain text write mode is the correct
            # idiom and truncates any existing file for today's date.
            self.file = codecs.open(file_path, 'w', encoding='utf-8')

        def close_spider(self, spider):
            self.file.close()

        def process_item(self, item, spider):
            # ensure_ascii=False keeps non-ASCII (e.g. Chinese) text readable
            # in the output file instead of \uXXXX escapes.
            line = json.dumps(dict(item), ensure_ascii=False) + "\n"
            self.file.write(line)
            return item
    
    
    class MongoPipeline(object):
        """Persist every scraped item into a MongoDB collection."""

        # All items from this project land in one fixed collection.
        collection_name = 'phones'

        def __init__(self, mongo_uri, mongo_db):
            self.mongo_uri = mongo_uri
            self.mongo_db = mongo_db

        @classmethod
        def from_crawler(cls, crawler):
            """Build the pipeline from the crawler's MONGO_* settings."""
            settings = crawler.settings
            return cls(
                mongo_uri=settings.get('MONGO_URI'),
                mongo_db=settings.get('MONGO_DATABASE')
            )

        def open_spider(self, spider):
            # One client per spider run; keep the database handle for reuse.
            self.client = pymongo.MongoClient(self.mongo_uri)
            self.db = self.client[self.mongo_db]

        def close_spider(self, spider):
            self.client.close()

        def process_item(self, item, spider):
            # Copy the item into a plain dict so the driver can serialize it.
            document = dict(item)
            self.db[self.collection_name].insert_one(document)
            return item
    
    
    # -*- coding: utf-8 -*-
    
    # Scrapy settings for jd_phone_model project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     http://doc.scrapy.org/en/latest/topics/settings.html
    #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
    import os
    from datetime import datetime
    
    # Deployment toggle: True = project not yet published (use the test/dev
    # config at the bottom of this file), False = formally published (use the
    # production config).
    __PROJECT_DONT_PUBLISH__ = True

    # Project root: two directory levels above this settings file.
    ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    BOT_NAME = 'jd_phone_model'

    SPIDER_MODULES = ['jd_phone_model.spiders']
    NEWSPIDER_MODULE = 'jd_phone_model.spiders'

    # Obey robots.txt rules (deliberately disabled for this crawler)
    ROBOTSTXT_OBEY = False

    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    # CONCURRENT_REQUESTS = 32

    # Configure a delay for requests for the same website (default: 0)
    # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    # DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    # CONCURRENT_REQUESTS_PER_DOMAIN = 16
    # CONCURRENT_REQUESTS_PER_IP = 16

    # Disable cookies (enabled by default)
    # COOKIES_ENABLED = False

    # Disable Telnet Console (enabled by default)
    # TELNETCONSOLE_ENABLED = False

    # Override the default request headers:
    # a fixed desktop Chrome User-Agent is sent with every request.
    DEFAULT_REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }

    # Enable or disable spider middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
    # SPIDER_MIDDLEWARES = {
    #    'jd_phone_model.middlewares.JdPhoneModelSpiderMiddleware': 543,
    # }

    # Enable or disable downloader middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    # DOWNLOADER_MIDDLEWARES = {
    #    'jd_phone_model.middlewares.MyCustomDownloaderMiddleware': 543,
    # }

    # Enable or disable extensions
    # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
    # EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    # }

    # Configure item pipelines
    # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
    # The JSON file writer runs first (priority 300), then MongoDB
    # persistence (301); both are defined in pipelines.py.
    ITEM_PIPELINES = {
        'jd_phone_model.pipelines.JsonWriterPipeline': 300,
        'jd_phone_model.pipelines.MongoPipeline': 301,
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
    # AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    # AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    # AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    # AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    # HTTPCACHE_ENABLED = True
    # HTTPCACHE_EXPIRATION_SECS = 0
    # HTTPCACHE_DIR = 'httpcache'
    # HTTPCACHE_IGNORE_HTTP_CODES = []
    # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    
    # Ensure the log and output directories exist before the spider starts;
    # JsonWriterPipeline relies on the trailing '/' of OUTPUT_DIR.
    LOG_DIR = ROOT_DIR + '/logs/'
    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)

    OUTPUT_DIR = ROOT_DIR + '/output/'
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    # Fix: use the idiomatic truthiness test instead of "== False".
    if not __PROJECT_DONT_PUBLISH__:
        # Published (production) configuration.
        LOG_LEVEL = 'DEBUG'

        log_d = '/data/logs/' + BOT_NAME + '/'
        if not os.path.exists(log_d):
            os.makedirs(log_d)
        LOG_FILE = log_d + 'DEBUG_%s.txt' % (datetime.now().strftime('%Y%m%d_%H%M_%S'))

        MONGO_URI = "mongodb://XXXX:27017"
        # NOTE(review): 'poi_dianping' looks copied from a different project
        # (BOT_NAME is jd_phone_model) -- confirm the production DB name.
        MONGO_DATABASE = 'poi_dianping'
    else:
        # Unpublished (development/test) configuration: local MongoDB.
        MONGO_URI = "mongodb://127.0.0.1:27017"
        MONGO_DATABASE = 'jd_phone_model'

    # SECURITY(review): database credentials are hard-coded in source
    # control; move them to environment variables or a secrets store.
    MYSQL_HOST = 'XXXX'
    MYSQL_USER = 'crawler'
    MYSQL_PASSWORD = 'pveLnmEzoGEJ9Cc'
    MYSQL_DATABASE = 'crawler'
    MYSQL_PORT = 3307
    
    

    相关文章

      网友评论

          本文标题:Pipeline常用写法

          本文链接:https://www.haomeiwen.com/subject/moabmftx.html