# -*- coding: utf-8 -*-
# Scrapy settings for renting project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Name of the bot implemented by this Scrapy project.
BOT_NAME = 'renting'

# Packages where Scrapy looks for spiders, and where `scrapy genspider`
# places newly generated ones.
SPIDER_MODULES = ['renting.spiders']
NEWSPIDER_MODULE = 'renting.spiders'
# Default User-Agent header; uncomment the line below to send a custom value.
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'renting (+http://www.yourdomain.com)'
# Whether to obey the robots.txt rules published at the root of each domain.
# Disabled here: robots.txt is not consulted before requests are made.
ROBOTSTXT_OBEY = False
# 默认请求并发数,默认16
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# 对于同一个网站爬取是否要有间歇,默认是0
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# 中间间隔3秒钟
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# 每个域名并发请求数
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# 是否启用cookies 默认启用
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# 是否启用Telnet控制台
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# 要不要覆盖默认请求头
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
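# Spider middlewares: uncomment the block below to enable the project's own spider middleware (downloader middlewares belong in DOWNLOADER_MIDDLEWARES, not here).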
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'renting.middlewares.RentingSpiderMiddleware': 543,
#}
# Custom downloader middlewares handling request headers (random User-Agent)
# and proxying. Lower numbers sit closer to the engine, so their
# process_request() runs earlier.
# NOTE(review): these entries were previously registered under
# SPIDER_MIDDLEWARES, where request-level hooks such as process_request()
# are never invoked; confirm both classes are written as downloader
# middlewares.
DOWNLOADER_MIDDLEWARES = {
    'renting.middlewares.RandomUA': 542,
    'renting.middlewares.ProxxyMiddleware': 543,
}
# 要不要启用下载器中间件
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'renting.middlewares.RentingDownloaderMiddleware': 543,
#}
# 是否关闭第三方插件
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# 配置管道
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'renting.pipelines.RentingPipeline': 300,
#}
# Item pipelines enabled for this project. Items pass through them in
# ascending order of the number: RentingPipeline (300), then
# MongodbPipeline (400).
ITEM_PIPELINES = {
    'renting.pipelines.RentingPipeline': 300,
    'renting.pipelines.MongodbPipeline': 400,
}
# MongoDB connection settings; read them at runtime with
# crawler.settings.get('MONGO_URI') / crawler.settings.get('MONGO_DB').
MONGO_URI = 'localhost'
MONGO_DB = 'renting'
#AUTOTHROTTLE插件,可以根据服务器和被抓取的网站调整抓取速度
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# 抓取初始链接会延迟5秒钟
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# 最大延迟60s
#AUTOTHROTTLE_MAX_DELAY = 60
# 向远程服务器发送的并行的请求数
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# 是否显示统计信息
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# 是否开启http缓存,需要根据实际情况
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
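# (Stray web-page text "网友评论" / "user comments" was here; it is not a setting and, as a bare name, would raise NameError when this module is imported.)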