Set the following in the Scrapy project's settings.py:
# MongoDB connection settings
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'data'
MONGODB_DOCNAME = 'chinadata'
ITEM_PIPELINES = {
    # 'chinaico.pipelines.ChinaicoPipeline': 300,
    'chinaico.pipelines.RedisPipeline': 301,
    'chinaico.pipelines.MongoPipeline': 302,  # give each pipeline a distinct order value so RedisPipeline runs first
}
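Both pipelines below assume the item exposes at least a name field, which MongoPipeline uses as its upsert key. A minimal items.py sketch for reference (the class name ChinaicoItem and the url field are illustrative placeholders, not from the original project):

import scrapy

class ChinaicoItem(scrapy.Item):
    name = scrapy.Field()  # required: MongoPipeline matches documents on "name"
    url = scrapy.Field()   # placeholder for whatever else the spider scrapes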
Set the following in the Scrapy project's pipelines.py:
import datetime
import json

import pymongo
import redis
from scrapy.utils.project import get_project_settings  # scrapy.conf was removed in modern Scrapy

settings = get_project_settings()
class ChinaicoPipeline(object):
    def process_item(self, item, spider):
        return item
class RedisPipeline(object):
    def __init__(self):
        self.r = redis.StrictRedis(host='127.0.0.1', password='', port=6379, db=3)

    def process_item(self, item, spider):
        # Serialize the item to JSON and add it to a Redis set
        self.r.sadd("webchinadata", json.dumps(dict(item), ensure_ascii=False))
        return item
class MongoPipeline(object):
    def __init__(self):
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        db_name = settings['MONGODB_DBNAME']
        client = pymongo.MongoClient(host=host, port=port)
        db = client[db_name]
        self.post = db[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        china_data = dict(item)
        china_data["time"] = datetime.datetime.now()
        # self.post.insert_one(china_data)  # insert data (plain insert; creates duplicates on re-runs)
        # Update data: upsert the document matching "name", inserting it if missing
        self.post.update_one({"name": item["name"]}, {"$set": china_data}, upsert=True)
        return item
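To check what RedisPipeline stored, here is a quick read-back sketch against the same Redis set (same host and db as above):

import json
import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379, db=3, decode_responses=True)
for raw in r.smembers("webchinadata"):
    print(json.loads(raw))  # each set member is one item serialized as JSON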
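Likewise, to confirm the Mongo upsert worked, a short verification sketch using the connection values from settings.py:

import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
coll = client['data']['chinadata']
print(coll.count_documents({}))       # total number of stored documents
print(coll.find_one({}, {"_id": 0}))  # peek at one document, omitting _id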