
5.2 Persistent Storage for Crawlers

Author: W11ng | Published 2020-03-14 23:38

    This example scrapes listing information from the "campus beauty" photo site 521609.com.
    Create a project:
    scrapy startproject tutorial
    cd into the tutorial project directory and create a spider:
    scrapy genspider belle 521609.com
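
    After these two commands the generated project typically looks roughly like this (exact files can vary slightly with the Scrapy version):

    tutorial/
        scrapy.cfg              # deploy configuration
        tutorial/
            __init__.py
            items.py            # item definitions (BellelItem goes here)
            middlewares.py
            pipelines.py        # item pipelines (all persistence code below lives here)
            settings.py         # project settings
            spiders/
                __init__.py
                belle.py        # the spider generated by genspider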


    1. Storing to a file
    Do the basic configuration in settings.py:

    # Spoof the User-Agent
    USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
    # Only show ERROR-level log output
    LOG_LEVEL = 'ERROR'
    # Do not obey robots.txt
    ROBOTSTXT_OBEY = False
    

    In items.py, rename the item class to BellelItem so it is easy to tell apart from items belonging to other spiders in the same project. Define the fields to store:

    import scrapy
    
    
    class BellelItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        title = scrapy.Field()
        img_url = scrapy.Field()
    

    Parse the site: in belle.py, extract the data with XPath.

    # -*- coding: utf-8 -*-
    import scrapy
    from tutorial.items import BellelItem
    
    
    class BelleSpider(scrapy.Spider):
        name = 'belle'
        allowed_domains = ['521609.com']
        start_urls = ['http://www.521609.com/xiaoyuanmeinv/']
    
        def parse(self, response):
            ph_list = response.xpath('//div[@class="index_img list_center"]/ul/li')
            for p in ph_list:
                title = p.xpath('./a[2]/text()').extract_first()
                # title = p.xpath('//a[@class="title"]/text()').extract_first()
                img_url = 'http://www.521609.com' + p.xpath('./a[1]/img/@src').extract_first()
                item = BellelItem()
                item['title'] = title
                item['img_url'] = img_url
                # hand the item over to the item pipelines
                yield item
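
    The XPath expressions can be checked interactively before running the full spider. A quick sketch using the Scrapy shell (the selectors are the same ones used in parse() above):

    # open an interactive shell against the listing page
    scrapy shell "http://www.521609.com/xiaoyuanmeinv/"

    # inside the shell, try the selectors:
    >>> lis = response.xpath('//div[@class="index_img list_center"]/ul/li')
    >>> lis[0].xpath('./a[2]/text()').extract_first()
    >>> lis[0].xpath('./a[1]/img/@src').extract_first()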
    
    

    pipelines.py
    When writing to a file, process_item() is called once for every item the spider yields, so opening the file there would reopen it for every single item. Instead, override open_spider() and close_spider(), which run exactly once when the spider starts and once when it finishes:

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    
    
    class TutorialPipeline(object):
        f = None
    
        def open_spider(self, spider):
            self.f = open("./belle.txt", 'w', encoding='utf-8')
    
        def process_item(self, item, spider):
            title = item['title']
            img_url = item['img_url']
            self.f.write(title + ":" + img_url + "\n")
            return item
    
        def close_spider(self, spider):
            self.f.close()
    
    

    Uncomment ITEM_PIPELINES in settings.py to enable the pipeline; the lower the number, the higher the priority (it runs earlier).

    # 300 is the priority; lower numbers run earlier
    ITEM_PIPELINES = {
       'tutorial.pipelines.TutorialPipeline': 300,
    }
    

    Run the spider:
    scrapy crawl belle
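
    For plain file output, Scrapy's built-in feed export is an alternative to a hand-written file pipeline: the -o option dumps every yielded item to a file without any pipeline code.

    # export all yielded items to JSON (CSV and JSON Lines work the same way)
    scrapy crawl belle -o belle.json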


    2. Storing to a database (MySQL)
    Modify pipelines.py as follows, importing the pymysql module:

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    import pymysql
    
    
    class TutorialPipeline(object):
        f = None
    
        def open_spider(self, spider):
            self.f = open("./belle.txt", 'w', encoding='utf-8')
    
        def process_item(self, item, spider):
            title = item['title']
            img_url = item['img_url']
            self.f.write(title + ":" + img_url + "\n")

            # pass the item on to the next pipeline class
            return item
    
        def close_spider(self, spider):
            self.f.close()
    
    
    class MySQLPipeline(object):
        conn = None
        cursor = None
    
        def open_spider(self, spider):
            self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', db='belle', charset='utf8')
    
        def process_item(self, item, spider):
            self.cursor = self.conn.cursor()
            title = item['title']
            img_url = item['img_url']
            # parameterized query: formatting the values into the SQL string breaks
            # on quotes in titles and is open to SQL injection
            insert_sql = "insert into belle values(%s, %s)"

            try:
                self.cursor.execute(insert_sql, (title, img_url))
                self.conn.commit()
            except Exception as e:
                print(e)
                self.conn.rollback()
            return item
    
        def close_spider(self, spider):
            self.cursor.close()
            self.conn.close()
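
    The pipeline above assumes a local MySQL server with a belle database and a two-column belle table that already exist. A minimal one-off setup sketch using pymysql (the column names and VARCHAR sizes are assumptions chosen to match the two-value INSERT, not something from the original post):

    import pymysql

    # one-off setup: create the database and table that MySQLPipeline writes to
    conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', charset='utf8')
    cursor = conn.cursor()
    cursor.execute("CREATE DATABASE IF NOT EXISTS belle CHARACTER SET utf8")
    # two columns matching item['title'] and item['img_url']
    cursor.execute("CREATE TABLE IF NOT EXISTS belle.belle (title VARCHAR(255), img_url VARCHAR(512))")
    conn.commit()
    cursor.close()
    conn.close()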
    
    
    

    Add the new pipeline class to the configuration in settings.py:

    ITEM_PIPELINES = {
       'tutorial.pipelines.TutorialPipeline': 300,
       # the newly added MySQLPipeline; its higher number means lower priority, so it runs after TutorialPipeline
       'tutorial.pipelines.MySQLPipeline': 301,
    }
    

    3. Storing to Redis
    Import redis in pipelines.py and add a new pipeline class:

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    import json
    import pymysql
    import redis
    
    
    class TutorialPipeline(object):
        f = None
    
        def open_spider(self, spider):
            self.f = open("./belle.txt", 'w', encoding='utf-8')
    
        def process_item(self, item, spider):
            title = item['title']
            img_url = item['img_url']
            self.f.write(title + ":" + img_url + "\n")

            # pass the item on to the next pipeline class
            return item
    
        def close_spider(self, spider):
            self.f.close()
    
    
    class MySQLPipeline(object):
        conn = None
        cursor = None
    
        def open_spider(self, spider):
            self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', db='belle',
                                        charset='utf8')
    
        def process_item(self, item, spider):
            self.cursor = self.conn.cursor()
            title = item['title']
            img_url = item['img_url']
            # parameterized query, as in the previous section
            insert_sql = "insert into belle values(%s, %s)"

            try:
                self.cursor.execute(insert_sql, (title, img_url))
                self.conn.commit()
            except Exception as e:
                print(e)
                self.conn.rollback()
            return item
    
        def close_spider(self, spider):
            self.cursor.close()
            self.conn.close()
    
    
    class RedisPipeline(object):
        conn = None
    
        def open_spider(self, spider):
            self.conn = redis.Redis(host='127.0.0.1', port=6379)
    
        def process_item(self, item, spider):
            title = item['title']
            img_url = item['img_url']
            item_dic = {
                "title": title,
                "img": img_url,
            }
            # serialize the item to JSON and push it onto the "belle" list
            self.conn.lpush("belle", json.dumps(item_dic))

            return item

        def close_spider(self, spider):
            print("Spider finished")
    

    Enable all three pipelines in settings.py and set their priorities:

    ITEM_PIPELINES = {
       'tutorial.pipelines.TutorialPipeline': 300,
       # MySQLPipeline and RedisPipeline run after TutorialPipeline (higher number = lower priority)
       'tutorial.pipelines.MySQLPipeline': 301,
       'tutorial.pipelines.RedisPipeline': 302,
    }
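
    After running scrapy crawl belle again with all three pipelines enabled, the stored data can be read back from Redis to check that it arrived. A minimal sketch, assuming the same local Redis instance and the "belle" list key used by RedisPipeline:

    import json

    import redis

    # connect to the same Redis instance the pipeline pushed to
    conn = redis.Redis(host='127.0.0.1', port=6379)

    # RedisPipeline used lpush, so read the whole list back
    for raw in conn.lrange("belle", 0, -1):
        item = json.loads(raw)
        print(item["title"], item["img"])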
    
