Various Pipeline Storage Types in Scrapy

Author: 八盖 | Published 2019-07-10 09:15

1. Saving to a CSV file

from scrapy.exporters import CsvItemExporter

class BossJobPipeline(object):

    def open_spider(self, spider):
        # CsvItemExporter expects a binary file handle.
        self.file = open("job.csv", "wb")
        self.exporter = CsvItemExporter(
            self.file,
            # Controls both which fields are written and their column order.
            fields_to_export=["job", "salary", "require", "company", "staff"]
        )
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        # Write one CSV row per item, then pass the item on.
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
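
For any of these pipelines to take effect, it must be enabled in the project's settings.py. A minimal sketch, assuming the project module is named myproject (the module path and the priority value 300 are placeholders):

# settings.py
ITEM_PIPELINES = {
    "myproject.pipelines.BossJobPipeline": 300,  # lower number = runs earlier
}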

2. Saving to a MySQL database

import pymysql

# Scrapy's predefined DropItem exception: raising it inside a pipeline
# tells Scrapy to discard the item (a usage sketch follows this section).
from scrapy.exceptions import DropItem


class DbzfPipeline(object):
    def open_spider(self, spider):
        self.con = pymysql.connect(
            host="localhost",
            user="root",
            password="****",
            database="demo",
            port=3306
        )
        self.cursor = self.con.cursor()

    def close_spider(self, spider):
        self.cursor.close()
        self.con.close()

    def process_item(self, item, spider):
        # Insert the item into the database.
        sql = """
            INSERT INTO db_zf (username, title, content, img)
            VALUES (%s, %s, %s, %s)
        """
        # Pass the values explicitly in column order; relying on
        # item.items() ordering can silently mismatch the columns.
        values = (item["username"], item["title"], item["content"], item["img"])
        self.cursor.execute(sql, values)
        self.con.commit()
        return item
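
A more generic variant builds the column and placeholder lists from the item itself instead of hard-coding them, and shows where DropItem would actually be raised. This is a sketch, not the author's code: treating "title" as a required field is an assumption, and because SQL identifiers cannot be parameterized, the item's field names must be trusted.

    def process_item(self, item, spider):
        # Discard incomplete records instead of inserting them.
        if not item.get("title"):  # "title" as the required field is an assumption
            raise DropItem("missing title in %r" % item)
        keys, values = zip(*item.items())
        # Builds "INSERT INTO db_zf (k1, k2, ...) VALUES (%s, %s, ...)".
        sql = "INSERT INTO db_zf ({}) VALUES ({})".format(
            ", ".join(keys),
            ", ".join(["%s"] * len(values))
        )
        self.cursor.execute(sql, values)
        self.con.commit()
        return item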

3. Saving to a MongoDB database

import pymongo

class MongoPipeline(object):

    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this to construct the pipeline, which lets us
        # read the connection details from the project settings.
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Convert the Item to a plain dict before insertion.
        self.db[self.collection_name].insert_one(dict(item))
        return item
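
Since from_crawler reads the connection details from the settings, they have to be defined there. A minimal sketch; the URI, database name, and priority are example values:

# settings.py -- example values
MONGO_URI = "mongodb://localhost:27017"
MONGO_DATABASE = "items"
ITEM_PIPELINES = {
    "myproject.pipelines.MongoPipeline": 400,
}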
