在pipeline中
import re
from yangguang.settings import MONGO_HOST
from pymongo import MongoClient
class YangguangPipeline(object):
    """Scrapy item pipeline that cleans ``content`` and stores items in MongoDB.

    Each processed item is written to the ``test`` collection of the
    ``test`` database.
    """

    # Matches a non-breaking space or any single whitespace character.
    _WHITESPACE_RE = re.compile(r"\xa0|\s")

    def open_spider(self, spider):
        """Open the MongoDB connection when the spider starts.

        The host comes from the ``MONGO_HOST`` setting. If the setting is
        missing, ``None`` is passed and ``MongoClient`` uses its default host.
        """
        self.client = MongoClient(spider.settings.get("MONGO_HOST"))
        self.collection = self.client["test"]["test"]

    def close_spider(self, spider):
        """Release the MongoDB connection when the spider finishes."""
        self.client.close()

    def process_item(self, item, spider):
        """Clean the item's ``content`` field, store the item, and return it.

        ``item["content"]`` is expected to be an iterable of strings; it is
        replaced in place by the cleaned list.
        """
        item["content"] = self.process_content(item["content"])
        print(item)
        # ``Collection.insert`` was removed in PyMongo 4; ``insert_one`` is the
        # supported call. A copy is inserted so the ``_id`` that PyMongo adds
        # to the document does not leak into the item passed down the chain.
        self.collection.insert_one(dict(item))
        return item

    def process_content(self, content):
        """Return ``content`` with all whitespace removed and blanks dropped.

        Every string has non-breaking spaces and whitespace characters
        stripped out; strings that end up empty are discarded.
        """
        stripped = (self._WHITESPACE_RE.sub("", text) for text in content)
        # Drop the strings that became empty after stripping.
        return [text for text in stripped if text]
网友评论