使用 pyspider 爬取最新的全国区划代码和城乡划分代码,并保存到 MongoDB
其中遇到一些坑:有些地级市下面没有设县级行政区,而是由市直接管辖镇(街道),所以需要对它们做特殊处理。
目前发现广东省东莞市、中山市和海南省儋州市存在这种情况;如果其他市也有类似情况,需要相应地修改下面的判断逻辑。
修改如下代码:
for each in response.doc('a[href^="http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018"]').items():
#特殊情况,东莞、中山、儋州下面没有设县级行政区,直接市管镇
if each.attr.href[-9:-5] == '4419' or each.attr.href[-9:-5] == '4420' or each.attr.href[-9:-5] == '4604':
self.crawl(each.attr.href, callback=self.street)
self.crawl(each.attr.href, callback=self.district)
for i in range(int(len(lData)/2)):
#由于特殊情况,东莞、中山、儋州直接到镇,所以这里要判断处理东莞、中山、儋州的镇上级pid为市的id
if lData[2*i][0:4] == '4419' or lData[2*i][0:4] == '4420' or lData[2*i][0:4] == '4604':
pid = lData[2*i][0:4]
else:
pid = lData[2*i][0:6]
完整代码如下:
from pyspider.libs.base_handler import *
import pymongo
class Handler(BaseHandler):
    """Crawl the 2018 administrative division code tables and store them in MongoDB.

    The crawl walks the site level by level, one callback per level. Every
    callback returns ``{'title': <level>, 'lRes': [records]}`` where each
    record is ``{'id', 'pid', 'name'}``. Ids are prefixes of the 12-digit
    code shown on the page, so that a record's ``pid`` equals its parent's
    ``id``:

    ========  =========  =========================================
    level     id         pid
    ========  =========  =========================================
    province  2 digits   ``'0'``
    city      4 digits   first 2 digits
    district  6 digits   first 4 digits
    street    9 digits   first 6 digits (first 4 for direct-town cities)
    detail    12 digits  first 9 digits
    ========  =========  =========================================
    """

    crawl_config = {
    }

    # Entry page of the 2018 code tables.
    INDEX_URL = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html'

    # Selects only the anchors that lead to a lower-level code table.
    LINK_SELECTOR = 'a[href^="http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018"]'

    # City ids whose page lists towns directly, with no district level in
    # between (Dongguan, Zhongshan, Danzhou). Extend this tuple if other
    # cities turn out to be organised the same way.
    DIRECT_TOWN_CITIES = ('4419', '4420', '4604')

    @every(minutes=24 * 60)
    def on_start(self):
        """Schedule the index page; provinces are parsed by :meth:`province`."""
        self.crawl(self.INDEX_URL, callback=self.province)

    def _code_name_rows(self, response):
        """Return ``(code, name)`` pairs read from the anchors of a table page.

        The anchor texts alternate code, name, code, name, ... The last
        anchor on the page is not a table cell (presumably the site footer
        link), so it is dropped. Slicing instead of ``pop()`` keeps a page
        without anchors from raising.
        """
        texts = [anchor.text() for anchor in response.doc('a').items()]
        texts = texts[:-1]
        return list(zip(texts[0::2], texts[1::2]))

    # Fetch province codes.
    @config(age=10 * 24 * 60 * 60)
    def province(self, response):
        """Parse the index page into province records and schedule city pages."""
        links = [each.attr.href for each in response.doc(self.LINK_SELECTOR).items()]
        # The two-digit province code is the file name of the link: ".../NN.html".
        codes = [href[-7:-5] for href in links]
        # All anchor texts except the last one (not a province entry).
        names = [anchor.text() for anchor in response.doc('a').items()][:-1]

        provinces = [
            {'id': code, 'pid': '0', 'name': name}
            for code, name in zip(codes, names)
        ]
        for href in links:
            self.crawl(href, callback=self.city)
        return {
            'title': 'province',
            'lRes': provinces,
        }

    # Fetch city codes.
    @config(age=10 * 24 * 60 * 60)
    def city(self, response):
        """Parse a province page into city records and schedule the next level.

        Cities listed in ``DIRECT_TOWN_CITIES`` link straight to a page of
        towns, so that page is handed to :meth:`street`; every other city
        page is handed to :meth:`district`.
        """
        cities = [
            {'id': code[0:4], 'pid': code[0:2], 'name': name}
            for code, name in self._code_name_rows(response)
        ]
        for each in response.doc(self.LINK_SELECTOR).items():
            href = each.attr.href
            # The four-digit city code is the file name: ".../NNNN.html".
            if href[-9:-5] in self.DIRECT_TOWN_CITIES:
                self.crawl(href, callback=self.street)
            else:
                self.crawl(href, callback=self.district)
        return {
            'title': 'city',
            'lRes': cities,
        }

    # Fetch district / county codes.
    @config(age=10 * 24 * 60 * 60)
    def district(self, response):
        """Parse a city page into district records and schedule street pages."""
        districts = [
            # Six-digit id so that street records (pid = first six digits)
            # link to it; parent is the four-digit city id.
            {'id': code[0:6], 'pid': code[0:4], 'name': name}
            for code, name in self._code_name_rows(response)
        ]
        for each in response.doc(self.LINK_SELECTOR).items():
            self.crawl(each.attr.href, callback=self.street)
        return {
            'title': 'district',
            'lRes': districts,
        }

    # Fetch street / town codes.
    @config(age=10 * 24 * 60 * 60)
    def street(self, response):
        """Parse a page of towns into street records and schedule village pages.

        Towns of a direct-town city have no district record above them, so
        their parent id is the four-digit city id instead of the six-digit
        district id.
        """
        streets = []
        for code, name in self._code_name_rows(response):
            city_id = code[0:4]
            parent_id = city_id if city_id in self.DIRECT_TOWN_CITIES else code[0:6]
            streets.append({'id': code[0:9], 'pid': parent_id, 'name': name})
        for each in response.doc(self.LINK_SELECTOR).items():
            self.crawl(each.attr.href, callback=self.detail)
        return {
            'title': 'street',
            'lRes': streets,
        }

    # Fetch neighbourhood committee / village codes.
    @config(age=10 * 24 * 60 * 60)
    def detail(self, response):
        """Parse a town page into village records (the lowest level).

        Each ``.villagetr`` row contributes three cells. The first is used
        as the full code and the third as the name; the middle cell
        (presumably the urban-rural classification code) is not stored.
        """
        cells = [cell.text() for cell in response.doc('.villagetr > td').items()]
        villages = [
            {'id': code, 'pid': code[0:9], 'name': name}
            for code, _unused, name in zip(cells[0::3], cells[1::3], cells[2::3])
        ]
        return {
            'title': 'detail',
            'lRes': villages,
        }

    # Override on_result to save the crawled records to MongoDB.
    def on_result(self, result):
        """Insert the records of one callback result into MongoDB.

        Records go to collection ``district_data`` of database ``district``
        on the local MongoDB server. Callbacks that return nothing (such as
        ``on_start``) and results without records are ignored.
        """
        if not result:
            return
        documents = [
            {'id': record['id'], 'pid': record['pid'], 'name': record['name']}
            for record in (result.get('lRes') or [])
        ]
        # insert_many() rejects an empty list, so stop before connecting.
        if not documents:
            return
        client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        try:
            # One batched write per page instead of one round trip per record.
            client['district']['district_data'].insert_many(documents)
        finally:
            # Release the connection; a new client is opened for every result.
            client.close()
image.png
附2019全国行政区划代码
链接: https://pan.baidu.com/s/1JCQ_wGaf8dlmtLMJU5xTYA 提取码: utz5
网友评论