1. Saving to a CSV file
from scrapy.exporters import CsvItemExporter

class BossJobPipeline(object):
    def open_spider(self, spider):
        # Open the output file in binary mode; CsvItemExporter writes bytes
        self.file = open("job.csv", "wb")
        self.exporter = CsvItemExporter(
            self.file,
            fields_to_export=["job", "salary", "require", "company", "staff"]
        )
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        # Write one item as a CSV row
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
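For any of these pipelines to take effect, they have to be enabled in the project's settings.py. A minimal sketch, assuming the project package is named boss_job (hypothetical) and the class above lives in boss_job/pipelines.py:

# settings.py -- register the pipeline; the number is its priority (0-1000, lower runs first)
ITEM_PIPELINES = {
    "boss_job.pipelines.BossJobPipeline": 300,
}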
2. Saving to a MySQL database
import pymysql
# DropItem is Scrapy's predefined exception; raise it inside a pipeline
# to tell Scrapy that this item should be discarded.
from scrapy.exceptions import DropItem

class DbzfPipeline(object):
    def open_spider(self, spider):
        # Open the database connection once when the spider starts
        self.con = pymysql.connect(
            host="localhost",
            user="root",
            password="****",
            database="demo",
            port=3306
        )
        self.cursor = self.con.cursor()

    def close_spider(self, spider):
        self.cursor.close()
        self.con.close()

    def process_item(self, item, spider):
        # Insert the item into the database.
        # A dynamic variant that builds the column list from the item's fields:
        # keys, values = zip(*item.items())
        # keys_str = ",".join(keys)
        # values_str = ",".join(["%s"] * len(values))
        # sql = "INSERT INTO db_zf({}) VALUES ({})".format(keys_str, values_str)
        sql = """
            INSERT INTO db_zf(username, title, content, img)
            VALUES (%s, %s, %s, %s)
        """
        # Note: this relies on the item's field order matching the column
        # order in the SQL statement above.
        keys, values = zip(*item.items())
        self.cursor.execute(sql, values)
        self.con.commit()
        return item
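The DropItem exception imported above is never actually raised in this example. A typical use is a small validation pipeline, placed before DbzfPipeline, that discards items missing required fields instead of inserting them. A minimal sketch; the class name and field list are illustrative, not part of the original project:

from scrapy.exceptions import DropItem

class RequiredFieldsPipeline(object):
    # Hypothetical validation pipeline: drops any item that is missing a required field
    required = ("username", "title", "content", "img")

    def process_item(self, item, spider):
        for field in self.required:
            if not item.get(field):
                raise DropItem("Missing %s in %r" % (field, item))
        return item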
3. Saving to a MongoDB database
import pymongo

class MongoPipeline(object):
    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection settings from the project's settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Convert the item to a plain dict and insert it as one document
        self.db[self.collection_name].insert_one(dict(item))
        return item
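The from_crawler hook above reads two custom settings, so they need to be defined in settings.py along with the pipeline entry. A minimal sketch; the URI, database name, and project package name (myproject) are placeholders:

# settings.py -- values read by MongoPipeline.from_crawler()
MONGO_URI = "mongodb://localhost:27017"
MONGO_DATABASE = "items"

ITEM_PIPELINES = {
    "myproject.pipelines.MongoPipeline": 400,
}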