
5.2 Persistent Storage for Crawlers

Author: W11ng | Published 2020-03-14 23:38

    This example scrapes listing information from the "campus beauty" photo site 521609.com.
    Create a project:
    scrapy startproject tutorial
    cd into the tutorial project directory and create a spider:
    scrapy genspider belle 521609.com
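
    After these two commands the generated project typically looks roughly like this (exact files can vary slightly with the Scrapy version):

    tutorial/
        scrapy.cfg              # deploy configuration
        tutorial/
            __init__.py
            items.py            # item definitions (BellelItem goes here)
            middlewares.py
            pipelines.py        # item pipelines (all persistence code below lives here)
            settings.py         # project settings
            spiders/
                __init__.py
                belle.py        # the spider generated by genspider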


    1. Storing to a file
    Do the basic configuration in settings.py:

    # Spoof the User-Agent
    USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
    # Only show ERROR-level log output
    LOG_LEVEL = 'ERROR'
    # Do not obey robots.txt
    ROBOTSTXT_OBEY = False
    

    In items.py, rename the item class to BellelItem so it is easy to tell apart from items belonging to other spiders in the same project. Define the fields to store:

    import scrapy
    
    
    class BellelItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        title = scrapy.Field()
        img_url = scrapy.Field()
    

    Parse the site: in belle.py, extract the data with XPath.

    # -*- coding: utf-8 -*-
    import scrapy
    from tutorial.items import BellelItem
    
    
    class BelleSpider(scrapy.Spider):
        name = 'belle'
        allowed_domains = ['521609.com']
        start_urls = ['http://www.521609.com/xiaoyuanmeinv/']
    
        def parse(self, response):
            ph_list = response.xpath('//div[@class="index_img list_center"]/ul/li')
            for p in ph_list:
                title = p.xpath('./a[2]/text()').extract_first()
                # title = p.xpath('//a[@class="title"]/text()').extract_first()
                img_url = 'http://www.521609.com' + p.xpath('./a[1]/img/@src').extract_first()
                item = BellelItem()
                item['title'] = title
                item['img_url'] = img_url
                # hand the item over to the item pipelines
                yield item
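
    The XPath expressions can be checked interactively before running the full spider. A quick sketch using the Scrapy shell (the selectors are the same ones used in parse() above):

    # open an interactive shell against the listing page
    scrapy shell "http://www.521609.com/xiaoyuanmeinv/"

    # inside the shell, try the selectors:
    >>> lis = response.xpath('//div[@class="index_img list_center"]/ul/li')
    >>> lis[0].xpath('./a[2]/text()').extract_first()
    >>> lis[0].xpath('./a[1]/img/@src').extract_first()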
    
    

    pipelines.py
    When writing to a file, process_item() is called once for every item the spider yields, so opening the file there would reopen it for every single item. Instead, override open_spider() and close_spider(), which run exactly once when the spider starts and once when it finishes:

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    
    
    class TutorialPipeline(object):
        f = None
    
        def open_spider(self, spider):
            self.f = open("./belle.txt", 'w', encoding='utf-8')
    
        def process_item(self, item, spider):
            title = item['title']
            img_url = item['img_url']
            self.f.write(title + ":" + img_url + "\n")
            return item
    
        def close_spider(self, spider):
            self.f.close()
    
    

    Uncomment ITEM_PIPELINES in settings.py to enable the pipeline; the lower the number, the higher the priority (it runs earlier).

    # 300 is the priority; lower numbers run earlier
    ITEM_PIPELINES = {
       'tutorial.pipelines.TutorialPipeline': 300,
    }
    

    Run the spider:
    scrapy crawl belle
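
    For plain file output, Scrapy's built-in feed export is an alternative to a hand-written file pipeline: the -o option dumps every yielded item to a file without any pipeline code.

    # export all yielded items to JSON (CSV and JSON Lines work the same way)
    scrapy crawl belle -o belle.json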


    2. Storing to a database (MySQL)
    Modify pipelines.py as follows, importing the pymysql module:

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    import pymysql
    
    
    class TutorialPipeline(object):
        f = None
    
        def open_spider(self, spider):
            self.f = open("./belle.txt", 'w', encoding='utf-8')
    
        def process_item(self, item, spider):
            title = item['title']
            img_url = item['img_url']
            self.f.write(title + ":" + img_url + "\n")

            # pass the item on to the next pipeline class
            return item
    
        def close_spider(self, spider):
            self.f.close()
    
    
    class MySQLPipeline(object):
        conn = None
        cursor = None
    
        def open_spider(self, spider):
            self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', db='belle', charset='utf8')
    
        def process_item(self, item, spider):
            self.cursor = self.conn.cursor()
            title = item['title']
            img_url = item['img_url']
            # parameterized query: formatting the values into the SQL string breaks
            # on quotes in titles and is open to SQL injection
            insert_sql = "insert into belle values(%s, %s)"

            try:
                self.cursor.execute(insert_sql, (title, img_url))
                self.conn.commit()
            except Exception as e:
                print(e)
                self.conn.rollback()
            return item
    
        def close_spider(self, spider):
            self.cursor.close()
            self.conn.close()
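
    The pipeline above assumes a local MySQL server with a belle database and a two-column belle table that already exist. A minimal one-off setup sketch using pymysql (the column names and VARCHAR sizes are assumptions chosen to match the two-value INSERT, not something from the original post):

    import pymysql

    # one-off setup: create the database and table that MySQLPipeline writes to
    conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', charset='utf8')
    cursor = conn.cursor()
    cursor.execute("CREATE DATABASE IF NOT EXISTS belle CHARACTER SET utf8")
    # two columns matching item['title'] and item['img_url']
    cursor.execute("CREATE TABLE IF NOT EXISTS belle.belle (title VARCHAR(255), img_url VARCHAR(512))")
    conn.commit()
    cursor.close()
    conn.close()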
    
    
    

    Add the new pipeline class to the configuration in settings.py:

    ITEM_PIPELINES = {
       'tutorial.pipelines.TutorialPipeline': 300,
       # the newly added MySQLPipeline; its higher number means lower priority, so it runs after TutorialPipeline
       'tutorial.pipelines.MySQLPipeline': 301,
    }
    

    3. Storing to Redis
    Import redis in pipelines.py and add a new pipeline class:

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    import json
    import pymysql
    import redis
    
    
    class TutorialPipeline(object):
        f = None
    
        def open_spider(self, spider):
            self.f = open("./belle.txt", 'w', encoding='utf-8')
    
        def process_item(self, item, spider):
            title = item['title']
            img_url = item['img_url']
            self.f.write(title + ":" + img_url + "\n")

            # pass the item on to the next pipeline class
            return item
    
        def close_spider(self, spider):
            self.f.close()
    
    
    class MySQLPipeline(object):
        conn = None
        cursor = None
    
        def open_spider(self, spider):
            self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', db='belle',
                                        charset='utf8')
    
        def process_item(self, item, spider):
            self.cursor = self.conn.cursor()
            title = item['title']
            img_url = item['img_url']
            # parameterized query, as in the previous section
            insert_sql = "insert into belle values(%s, %s)"

            try:
                self.cursor.execute(insert_sql, (title, img_url))
                self.conn.commit()
            except Exception as e:
                print(e)
                self.conn.rollback()
            return item
    
        def close_spider(self, spider):
            self.cursor.close()
            self.conn.close()
    
    
    class RedisPipeline(object):
        conn = None
    
        def open_spider(self, spider):
            self.conn = redis.Redis(host='127.0.0.1', port=6379)
    
        def process_item(self, item, spider):
            title = item['title']
            img_url = item['img_url']
            item_dic = {
                "title": title,
                "img": img_url,
            }
            # serialize the item to JSON and push it onto the "belle" list
            self.conn.lpush("belle", json.dumps(item_dic))

            return item

        def close_spider(self, spider):
            print("Spider finished")
    

    Enable all three pipelines in settings.py and set their priorities:

    ITEM_PIPELINES = {
       'tutorial.pipelines.TutorialPipeline': 300,
       # MySQLPipeline and RedisPipeline run after TutorialPipeline (higher number = lower priority)
       'tutorial.pipelines.MySQLPipeline': 301,
       'tutorial.pipelines.RedisPipeline': 302,
    }
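
    After running scrapy crawl belle again with all three pipelines enabled, the stored data can be read back from Redis to check that it arrived. A minimal sketch, assuming the same local Redis instance and the "belle" list key used by RedisPipeline:

    import json

    import redis

    # connect to the same Redis instance the pipeline pushed to
    conn = redis.Redis(host='127.0.0.1', port=6379)

    # RedisPipeline used lpush, so read the whole list back
    for raw in conn.lrange("belle", 0, -1):
        item = json.loads(raw)
        print(item["title"], item["img"])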
    
