Crawling China's Administrative Divisions with Scrapy and Inserting Them into MySQL in Real Time

Author: 349ff5da91d8 | Published 2017-11-09 10:31
    Crawler framework: Scrapy
    Database module: pymysql (pooled via DBUtils)
    Python version: 3.5.3
    OS: Windows 10
    Takeaway: make good use of existing tools and get familiar with the requirements first.
    Steps:
    1. Create the project: scrapy startproject home_scrapy
    2. Write the spider code
    3. Write the pipeline code (JSON file storage and MySQL storage); a sample of the records the pipelines receive is shown after this list
    4. Run the spider: scrapy crawl spiderone
    Crawl duration: 11:28 - 17:41 (6 hours 13 minutes in total)
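
    For reference, every level yields a flat record with the same four fields defined in items.py. A hedged sketch of what one crawled record looks like, based on how parse() builds province items (the Beijing code "11" is used purely as an illustration):

    # Illustrative only: the shape of one item as the pipelines receive it.
    sample_record = {
        "id": "110000000000",    # 12-digit division code ('11' + ten zeros)
        "aname": "北京市",        # division name taken from the link text
        "lv": 1,                 # level: 1 = province ... 5 = village
        "pid": 100000000000,     # parent code; the spider uses this fixed value for provinces
    }
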
    Code:

    spiderone.py:
    # -*- coding: utf-8 -*-
    import scrapy
    from home_scrapy.items import HomeScrapyItem
    import os
    
    
    class SpideroneSpider(scrapy.Spider):
        name = 'spiderone'
        allowed_domains = ['www.stats.gov.cn']  # domains only, no scheme, otherwise offsite filtering misbehaves
        baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/'
        start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html']
        # start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/43.html']
    
        # Province level (lv = 1)
        def parse(self, response):
            node_list = response.xpath('//tr[@class="provincetr"]/td/a')
            for province_node in node_list:
                province_data = HomeScrapyItem()
                aname = province_node.xpath('text()').extract()
                url = province_node.xpath('@href').extract()
                pre_id = str(url[0])
                id = pre_id.split('.', -1)[0] + '0000000000'
                province_data['aname'] = aname[0]
                province_data['id'] = id
                province_data['lv'] = 1
                province_data['pid'] = 100000000000
                city_url = self.baseUrl + str(url[0])
                yield scrapy.Request(city_url, meta={'province_data': province_data}, callback=self.city_parse,
                                     encoding='utf-8', dont_filter=True)
                yield province_data
    
        # City / prefecture level (lv = 2)
        def city_parse(self, response):
            city_list = response.xpath('//tr[@class="citytr"]')
            meta = response.meta['province_data']
            for city_node in city_list:
                city_data = dict(meta)
                number = city_node.xpath('td[1]/a/text()').extract()
                name = city_node.xpath('td[2]/a/text()').extract()
                city_url = city_node.xpath('td[1]/a/@href').extract()
                city_data['id'] = number[0]
                city_data['aname'] = name[0]
                city_data['lv'] = 2
                city_data['pid'] = meta['id']
                country_url = self.baseUrl + str(city_url[0])
                yield scrapy.Request(country_url, meta={'city_data': city_data}, callback=self.country_parse,
                                     encoding='utf-8', dont_filter=True)
                yield city_data
    
    
        # County / district level (lv = 3); the original code uses "country" where "county" is meant
        def country_parse(self, response):
            meta = response.meta['city_data']
            country_list = response.xpath('//tr[@class="countytr"]')
            for country_node in country_list:
    
                if country_node.xpath('td/a/text()'):
                    country_data = dict(meta)
                    number = country_node.xpath('td[1]/a/text()').extract()
                    name = country_node.xpath('td[2]/a/text()').extract()
                    country_data['id'] = number[0]
                    country_data['aname'] = name[0]
                    country_data['lv'] = 3
                    country_data['pid'] = meta['id']
                    country_url = country_node.xpath('td[1]/a/@href').extract()[0]
                    base_link = os.path.dirname(response.url)
                    next_link = '/'.join([base_link, country_url])
                    yield scrapy.Request(next_link, meta={'country_data': country_data}, callback=self.town_parse,
                                         encoding='utf-8', dont_filter=True)
                    yield country_data
                else:
                    country_data = dict(meta)
                    number = country_node.xpath('td[1]/text()').extract()
                    name = country_node.xpath('td[2]/text()').extract()
                    country_data['id'] = number[0]
                    country_data['aname'] = name[0]
                    country_data['lv'] = 3
                    country_data['pid'] = meta['id']
                    yield country_data
    
        # Township / town / sub-district level (lv = 4)
        def town_parse(self, response):
            meta = response.meta['country_data']
            town_list = response.xpath('//tr[@class="towntr"]')
            for town_node in town_list:
                town_data = dict(meta)
                number = town_node.xpath('td[1]/a/text()').extract()
                name = town_node.xpath('td[2]/a/text()').extract()
                town_data['id'] = number[0]
                town_data['aname'] = name[0]
                town_data['lv'] = 4
                town_data['pid'] = meta['id']
                town_url = town_node.xpath('td[1]/a/@href').extract()[0]
                base_link = os.path.dirname(response.url)
                next_link = '/'.join([base_link, town_url])
                yield scrapy.Request(next_link, meta={'town_data': town_data}, callback=self.village_parse,
                                     encoding='utf-8', dont_filter=True)
                yield town_data

        # Village / neighborhood committee level (lv = 5)
        def village_parse(self, response):
            meta = response.meta['town_data']
            village_list = response.xpath('//tr[@class="villagetr"]')
            for village_node in village_list:
                village_data = dict(meta)
                number = village_node.xpath('td[1]/text()').extract()
                code = village_node.xpath('td[2]/text()').extract()
                name = village_node.xpath('td[3]/text()').extract()
                village_data['id'] = number[0]
                # village_data['villageCode'] = code[0]
                village_data['aname'] = name[0]
                village_data['lv'] = 5
                village_data['pid'] = meta['id']
                yield village_data
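
    A quick standalone illustration of the id scheme used in parse() above: province pages are linked as "<two-digit code>.html", and the spider pads that prefix with zeros to form the 12-digit division code stored in id.

    # Standalone sketch, not part of the project code.
    url = '11.html'                       # href of a province link on the index page
    pre_id = url.split('.', -1)[0]        # -> '11'
    province_id = pre_id + '0000000000'   # -> '110000000000' (12-digit division code)
    parent_id = 100000000000              # fixed pid the spider assigns to province records
    print(province_id, parent_id)
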
    
    pipelines.py:
    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
    import json
    import threading
    import pymysql
    from DBUtils.PooledDB import PooledDB


    # Writes each item to a local data.json file, one JSON object per line (comma-separated)
    class HomeScrapyPipeline(object):
        def __init__(self):
            self.f = open("data.json", 'wb')
    
        def process_item(self, item, spider):
            content = json.dumps(dict(item), ensure_ascii=False) + ", \n"
            self.f.write(content.encode('utf-8'))
            return item
    
        def close_spider(self, spider):
            self.f.close()
    
    
    lock = threading.RLock()


    # Inserts each item into the MySQL "area" table through a DBUtils connection pool
    class HomeScrapyMySQLPipeline(object):
        def __init__(self, dbpool):
            self.dbpool = dbpool
    
    
        @classmethod
        def from_settings(cls, settings):
            # Class method that builds the database connection pool from the crawler settings
            dbpool = PooledDB(creator=pymysql,
                     mincached=settings['DB_MIN_CACHED'], maxcached=settings['DB_MAX_CACHED'],
                     maxshared=settings['DB_MAX_SHARED'], maxconnections=settings['DB_MAX_CONNECTIONS'],
                     blocking=settings['DB_BLOCKING'], maxusage=settings['DB_MAX_USAGE'], setsession=settings['DB_SET_SESSION'],
                     host=settings['DB_HOST'], port=settings['DB_PORT'],
                     user=settings['DB_USER'], passwd=settings['DB_PASSWD'],
                     db=settings['DB_NAME'], charset=settings['DB_CHARSET'], use_unicode=False
                     )
            return cls(dbpool)  
    
        # Called by Scrapy for every item that passes through the pipeline
        def process_item(self, item, spider):
            lock.acquire()
            conn = self.dbpool.connection()
            cursor = conn.cursor()
            sql = "insert into area(id, aname, lv, pid) values(%s,%s,%s,%s)"
            params = (item["id"], item["aname"], item["lv"], item["pid"])
            cursor.execute(sql, params)
            conn.commit()
            cursor.close()
            conn.close()
            lock.release()
            return item
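
    The MySQL pipeline assumes an area table with the four columns used in the INSERT above. A minimal sketch of how such a table might be created (the column types here are my own assumption; adjust them as needed):

    # One-off helper, not part of the project: create the "area" table that
    # HomeScrapyMySQLPipeline inserts into. Connection values mirror settings.py.
    import pymysql

    ddl = """
    CREATE TABLE IF NOT EXISTS area (
        id    BIGINT       NOT NULL PRIMARY KEY,  -- 12-digit division code
        aname VARCHAR(100) NOT NULL,              -- division name
        lv    TINYINT      NOT NULL,              -- level: 1 province ... 5 village
        pid   BIGINT       NOT NULL               -- parent division code
    ) DEFAULT CHARSET = utf8;
    """

    conn = pymysql.connect(host="localhost", port=3306, user="root",
                           passwd="root111", db="test", charset="utf8")
    with conn.cursor() as cur:
        cur.execute(ddl)
    conn.commit()
    conn.close()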
    
    
    items.py:
    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    class HomeScrapyItem(scrapy.Item):
    
        # provinceName = scrapy.Field()  # province / municipality / autonomous region name
        #
        # cityName = scrapy.Field()      # city / prefecture-level name
        # cityNumber = scrapy.Field()
        #
        # countryName = scrapy.Field()   # county / district-level name
        # countryNumber = scrapy.Field()
        #
        # townName = scrapy.Field()      # township / town-level name
        # townNumber = scrapy.Field()
        #
        # villageName = scrapy.Field()   # street / town / village-level name
        # villageNumber = scrapy.Field()
        # villageCode = scrapy.Field()
        id = scrapy.Field()     # division code (12 digits)
        aname = scrapy.Field()  # division name
        lv = scrapy.Field()     # division level (1 province ... 5 village)
        pid = scrapy.Field()    # parent division code
    
    settings.py:
    # -*- coding: utf-8 -*-
    
    # Scrapy settings for home_scrapy project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     http://doc.scrapy.org/en/latest/topics/settings.html
    #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'home_scrapy'
    
    SPIDER_MODULES = ['home_scrapy.spiders']
    NEWSPIDER_MODULE = 'home_scrapy.spiders'
    
    # Test MySQL database connection info
    DB_HOST = "localhost"
    DB_PORT = 3306
    DB_NAME = "test"
    DB_USER = "root"
    DB_PASSWD = "root111"

    # Database connection charset
    DB_CHARSET = "utf8"

    # mincached: number of idle connections opened at startup (default 0 means none are created at startup)
    DB_MIN_CACHED = 10

    # maxcached: maximum number of idle connections kept in the pool (default 0 means no limit)
    DB_MAX_CACHED = 10

    # maxshared: maximum number of shared connections (default 0 means all connections are dedicated);
    # once the limit is reached, connections requested as shareable are shared
    DB_MAX_SHARED = 20

    # maxconnections: maximum number of connections the pool may create (default 0 means unlimited)
    DB_MAX_CONNECTIONS = 100

    # blocking: behavior when the pool has reached maxconnections (0 or False means raise an error;
    # anything else means block until a connection becomes available)
    DB_BLOCKING = True

    # maxusage: maximum number of times a single connection may be reused (0 or False means unlimited);
    # when the limit is reached the connection is automatically reset (closed and reopened)
    DB_MAX_USAGE = 0

    # setsession: optional list of SQL commands used to prepare each session, e.g. ["set datestyle to german", ...]
    DB_SET_SESSION = None
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    # USER_AGENT = 'home_scrapy (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    # CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    # DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    # CONCURRENT_REQUESTS_PER_DOMAIN = 16
    # CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    # COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    # TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    # DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    # }
    
    # Enable or disable spider middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
    # SPIDER_MIDDLEWARES = {
    #    'home_scrapy.middlewares.HomeScrapySpiderMiddleware': 543,
    # }
    
    # Enable or disable downloader middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    # DOWNLOADER_MIDDLEWARES = {
    #    'home_scrapy.middlewares.MyCustomDownloaderMiddleware': 543,
    # }
    
    # Enable or disable extensions
    # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
    # EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    # }
    
    # Configure item pipelines
    # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
        'home_scrapy.pipelines.HomeScrapyPipeline': 300,      # JSON file output
        'home_scrapy.pipelines.HomeScrapyMySQLPipeline': 400,  # MySQL insert (lower numbers run first)
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
    # AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    # AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    # AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    # AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    # HTTPCACHE_ENABLED = True
    # HTTPCACHE_EXPIRATION_SECS = 0
    # HTTPCACHE_DIR = 'httpcache'
    # HTTPCACHE_IGNORE_HTTP_CODES = []
    # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    
    Sample database query screenshots (images omitted): 微信截图_20171109102727.png, 微信截图_20171109102858.png
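
    A quick way to spot-check the imported data (my own example query, not necessarily the one shown in the screenshots):

    # Count the imported rows per administrative level in the "area" table.
    import pymysql

    conn = pymysql.connect(host="localhost", port=3306, user="root",
                           passwd="root111", db="test", charset="utf8")
    with conn.cursor() as cur:
        cur.execute("SELECT lv, COUNT(*) FROM area GROUP BY lv ORDER BY lv")
        for lv, cnt in cur.fetchall():
            print(lv, cnt)
    conn.close()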
