BloomFilter主要用于检索一个元素是否在集合中。优点是空间效率和查询效率比较高。缺点是存在误判率。
使用bloomfilter对url进行压缩可以很好的节约内存空间.
目的
在scrapy中集成bloomfilter
1.安装scrapy-redis-bloomfilter
pip install scrapy-redis-bloomfilter
2.修改配置文件
# scrapy redis 配置
SCHEDULER = "scrapy_redis_bloomfilter.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis_bloomfilter.dupefilter.RFPDupeFilter"
# redis://[user:pass]@host:port/db
REDIS_URL = 'redis://@localhost:6379'
# Schedule requests using a priority queue. (default)
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# Alternative queues.
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'
# 持久化,不自动清空
SCHEDULER_PERSIST = True
# Store scraped item in redis for post-processing.
# ITEM_PIPELINES = {
# 'scrapy_redis.pipelines.RedisPipeline': 300
# }
# Number of Hash Functions to use, defaults to 6
BLOOMFILTER_HASH_NUMBER = 6
# Redis Memory Bit of Bloomfilter Usage, 30 means 2^30 = 128MB, defaults to 30
BLOOMFILTER_BIT = 30
网友评论