美文网首页
pyspider爬取最新的全国区划代码和城乡划分代码

pyspider爬取最新的全国区划代码和城乡划分代码

作者: sankyson | 来源:发表于2019-05-08 15:09 被阅读0次

    pyspider爬取最新的全国区划代码和城乡划分代码,保存到mongodb

    其中遇到一些坑:部分地级市下面没有设县级行政区,由市直接管辖镇(街道),所以要对这类城市做特殊处理。
    目前发现广东省东莞市、中山市和海南省儋州市存在这种情况;如果还有其他城市有类似情况,需要相应修改下面的判断条件。

    修改如下代码:

     for each in response.doc('a[href^="http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018"]').items():
                # Special case: Dongguan (4419), Zhongshan (4420) and Danzhou (4604) have no
                # county-level divisions, so their city links lead straight to town pages.
                # Send those links to the street parser only; every other link goes to the
                # district parser. The branches are mutually exclusive so the same URL is
                # never scheduled with two different callbacks.
                if each.attr.href[-9:-5] in ('4419', '4420', '4604'):
                    self.crawl(each.attr.href, callback=self.street)
                else:
                    self.crawl(each.attr.href, callback=self.district)
    
    for i in range(len(lData) // 2):
                # Dongguan (4419), Zhongshan (4420) and Danzhou (4604) go straight from city
                # to town, so for them the parent id of a town is the 4-digit city id;
                # everywhere else the parent id is the 6-digit county id.
                city_prefix = lData[2*i][0:4]
                pid = city_prefix if city_prefix in ('4419', '4420', '4604') else lData[2*i][0:6]
    

    完整代码如下:

    from pyspider.libs.base_handler import *
    import pymongo
    
    class Handler(BaseHandler):
        """pyspider handler that crawls the 2018 administrative division codes.

        The crawl walks the site level by level (province -> city -> district ->
        street -> detail).  Every callback returns a dict of the form
        ``{'title': <level>, 'lRes': [{'id', 'pid', 'name'}, ...]}`` and
        ``on_result`` stores the ``lRes`` records in MongoDB.
        """

        crawl_config = {
        }

        # Entry page of the 2018 edition of the code tables.
        START_URL = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html'
        # Selects the links that lead one level further down the code tables.
        LINK_SELECTOR = 'a[href^="http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018"]'
        # Cities without county-level divisions (Dongguan, Zhongshan, Danzhou):
        # their city page links directly to town pages.
        DIRECT_TOWN_CITIES = ('4419', '4420', '4604')

        @staticmethod
        def _anchor_texts(response):
            """Return the text of every ``<a>`` on the page except the last one.

            The last anchor is dropped because it is not part of the code table
            (presumably the site footer link -- verify against the live page).
            Slicing instead of ``pop()`` keeps a page without anchors from
            raising ``IndexError``.
            """
            return [anchor.text() for anchor in response.doc('a').items()][:-1]

        @staticmethod
        def _code_name_pairs(texts):
            """Pair up a flat ``[code, name, code, name, ...]`` list as (code, name)."""
            return zip(texts[0::2], texts[1::2])

        @every(minutes=24 * 60)
        def on_start(self):
            """Schedule the index page once a day."""
            self.crawl(self.START_URL, callback=self.province)

        @config(age=10 * 24 * 60 * 60)
        def province(self, response):
            """Collect province codes and schedule every province page.

            The 2-digit code is taken from the link target (``.../NN.html``); the
            name is the anchor text at the same position.
            """
            links = list(response.doc(self.LINK_SELECTOR).items())
            codes = [link.attr.href[-7:-5] for link in links]
            names = self._anchor_texts(response)

            # zip() stops at the shorter list, so a mismatch cannot raise IndexError.
            lProvince = [
                {'id': code, 'pid': '0', 'name': name}
                for code, name in zip(codes, names)
            ]

            for link in links:
                self.crawl(link.attr.href, callback=self.city)
            return {
                'title': 'province',
                'lRes': lProvince,
            }

        @config(age=10 * 24 * 60 * 60)
        def city(self, response):
            """Collect city codes and schedule the next level for each city.

            The city id is the first 4 digits of the code and its parent is the
            2-digit province id.
            """
            lCity = [
                {'id': code[0:4], 'pid': code[0:2], 'name': name}
                for code, name in self._code_name_pairs(self._anchor_texts(response))
            ]

            for link in response.doc(self.LINK_SELECTOR).items():
                href = link.attr.href
                # Cities without counties link straight to town pages, so they
                # must be parsed by ``street`` and not also by ``district``.
                if href[-9:-5] in self.DIRECT_TOWN_CITIES:
                    self.crawl(href, callback=self.street)
                else:
                    self.crawl(href, callback=self.district)
            return {
                'title': 'city',
                'lRes': lCity,
            }

        @config(age=10 * 24 * 60 * 60)
        def district(self, response):
            """Collect county/district codes and schedule every district page.

            The district id is the first 6 digits of the code and its parent is
            the 4-digit city id, so it lines up with the ids produced by ``city``
            and with the ``pid`` values produced by ``street``.
            """
            lDistrict = [
                {'id': code[0:6], 'pid': code[0:4], 'name': name}
                for code, name in self._code_name_pairs(self._anchor_texts(response))
            ]

            for link in response.doc(self.LINK_SELECTOR).items():
                self.crawl(link.attr.href, callback=self.street)
            return {
                'title': 'district',
                'lRes': lDistrict,
            }

        @config(age=10 * 24 * 60 * 60)
        def street(self, response):
            """Collect town/street codes and schedule every town page.

            The street id is the first 9 digits of the code.  Its parent is the
            6-digit district id, except for towns of the cities listed in
            ``DIRECT_TOWN_CITIES``: those have no district record, so the parent
            is the 4-digit city id.
            """
            lStreet = []
            for code, name in self._code_name_pairs(self._anchor_texts(response)):
                city_id = code[0:4]
                pid = city_id if city_id in self.DIRECT_TOWN_CITIES else code[0:6]
                lStreet.append({'id': code[0:9], 'pid': pid, 'name': name})

            for link in response.doc(self.LINK_SELECTOR).items():
                self.crawl(link.attr.href, callback=self.detail)

            return {
                'title': 'street',
                'lRes': lStreet,
            }

        @config(age=10 * 24 * 60 * 60)
        def detail(self, response):
            """Collect neighbourhood-committee / village codes (leaf level).

            Table cells are read in groups of three; the first cell is the full
            code (used as id, its first 9 digits as parent id) and the third cell
            is the name.  The middle cell is not stored.
            """
            cells = [cell.text() for cell in response.doc('.villagetr > td').items()]

            lDetail = [
                {'id': code, 'pid': code[0:9], 'name': name}
                for code, name in zip(cells[0::3], cells[2::3])
            ]
            return {
                'title': 'detail',
                'lRes': lDetail,
            }

        def on_result(self, result):
            """Store the records of one callback result in MongoDB.

            Overrides the default result handling.  Results that are empty or
            carry no ``lRes`` records are ignored.  All records of a result are
            written with a single ``insert_many`` call and the client is closed
            afterwards so connections do not pile up across results.
            """
            if not result:
                return
            records = result.get('lRes')
            if not records:
                # insert_many() rejects an empty document list.
                return

            documents = [
                {'id': record['id'], 'pid': record['pid'], 'name': record['name']}
                for record in records
            ]

            client = pymongo.MongoClient(host='127.0.0.1', port=27017)
            try:
                client['district']['district_data'].insert_many(documents)
            finally:
                client.close()
    
    
    image.png

    附2019全国行政区划代码

    链接: https://pan.baidu.com/s/1JCQ_wGaf8dlmtLMJU5xTYA 提取码: utz5

    相关文章

      网友评论

          本文标题:pyspider爬取最新的全国区划代码和城乡划分代码

          本文链接:https://www.haomeiwen.com/subject/rtnmoqtx.html