31、BeautifulSoup实例3：民政部区域数据抓取

作者: 魔方宫殿 | 来源:发表于2022-04-15 22:17 被阅读0次

31、BeautifulSoup实例3：人社部区域数据抓取
30、BeautifulSoup实例2：统计区域数据抓取
32、正则表达式
Beautifulsoup的用法实例
一个简单的不能再简单的python爬虫
Python实战抓取捧腹网笑话数据
BeautifulSoup4爬取某社招网站数据
3种网页抓取方法
python网络爬虫-爬取网页的三种方式（1）
BeautifulSoup简介与安装

Life is short, you need Python!

上集回顾：

需求分析
流程图
代码实现

上集尝试了抓取统计局区域数据，学习了多层级页面的数据抓取和保存。
本集尝试抓取民政部（www.mca.gov.cn）的行政区域数据。统计局的区域数据可能快递外卖使用比较多，但行政区域还是以民政部为准。

一、需求分析
还是先看网页界面长什么样：

民政局
很好，这次不需要多层级页面跳转了，数据全在一个页面里。

右键查看html代码：

html

可以发现数据在class="xl7228320"的tag里，纯数字的为区域代码，可以通过正则表达式匹配4个0结尾的所有省级区域。然后再通过遍历兄弟节点，获取市级区域，同理再获取区级数据。

二、代码实现

from bs4 import BeautifulSoup
import requests
import datetime
import time
import random
import re

# Page that get_province() downloads and parses for administrative-division codes.
URL = 'http://www.mca.gov.cn/article/sj/xzqh/2020/20201201.html'

def get_header():
    """Build the HTTP request headers used when fetching pages.

    Returns:
        A new dict holding a single desktop-browser ``User-Agent`` entry.
    """
    user_agent = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
    )
    return {'User-Agent': user_agent}

def get_soup(url):
    """Download *url* and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` tree built from the raw response bytes using the
        built-in ``html.parser``.

    Raises:
        requests.RequestException: If the connection fails, the request
            times out, or the server answers with an error status.
    """
    print('loading...')
    # The headers must be passed by keyword: the second positional argument
    # of requests.get() is `params`, so passing the dict positionally would
    # append the User-Agent to the query string instead of sending it as a
    # request header.
    # The timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, headers=get_header(), timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page as data.
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")

def write_start(tag, file):
    """Write the opening of a named JSON node and return the tag's link.

    Args:
        tag: Object exposing ``.text`` (the node name) and ``.get('href')``.
        file: Writable text stream that receives the JSON fragment.

    Returns:
        Whatever ``tag.get('href')`` yields.
    """
    label = tag.text
    print('write: ', label)
    opening = ''.join(('{"name":"', label, '", "children": [\n'))
    file.write(opening)
    return tag.get('href')

def write_end(file, is_end):
    """Close a JSON node opened with a ``children`` array, then pause.

    Args:
        file: Writable text stream that receives the closing fragment.
        is_end: True for the last sibling (no trailing comma is written).
    """
    closing = ']}\n' if is_end else ']},\n'
    file.write(closing)
    # Random 1-2 second pause after each node.
    time.sleep(random.random() + 1)

def get_sibling(tag, file, city_start=False, area_start=False):
    """Write the city/area rows that follow *tag* as JSON fragments.

    Walks forward through the rows after *tag* and emits one JSON object per
    row until the next province row, a row with a blank name, or the end of
    the sibling chain is reached. In all three cases any area object and city
    object still open are closed before returning, so the caller can always
    append its own closing ``]}``.

    Args:
        tag: Row to start from; the row itself is not written.
        file: Writable text stream that receives the JSON fragments.
        city_start: True if a city object (with an open ``children`` array)
            has already been written and not yet closed.
        area_start: True if an area object has been written and not yet
            closed.
    """
    row = tag
    while True:
        # Rows are two siblings apart; the node in between is skipped.
        gap = row.next_sibling
        row = gap.next_sibling if gap is not None else None

        # contents[5] holds the name cell and contents[3] the code cell.
        name = row.contents[5].text.strip() if row is not None else ''
        code = row.contents[3].text if name else ''

        # End of this province: next province row, blank row, or no more rows.
        if not name or code.endswith('0000'):
            if area_start:
                file.write('}\n')
            if city_start:
                file.write(']}\n')
            return

        if code.endswith('00'):  # city
            print('city: ' + name + '-' + code)
            if area_start:
                # Close the last area of the previous city.
                file.write('}\n')
                area_start = False
            if city_start:
                # Close the previous city; another city follows.
                file.write(']},\n')
            else:
                city_start = True
            file.write('{"code":"' + code + '", "name":"' + name + '", "children":[\n')
        else:  # area
            print('area: ' + name + '-' + code)
            if area_start:
                # Close the previous area; another area follows.
                file.write('},\n')
            else:
                area_start = True
            file.write('{"code":"' + code + '", "name":"' + name + '"')

# Province-level codes that get_province() wraps in one synthetic city node
# (code + 100) so that their districts still sit under a city level.
special_1 = {
    '110000': '北京市',
    '120000': '天津市',
    '310000': '上海市',
    '500000': '重庆市',
}
# Province-level codes that get_province() writes as leaf entries, with no
# "children" array (Taiwan, Hong Kong, Macao).
special_2 = {
    '710000': '台湾省',
    '810000': '香港特别行政区',
    '820000': '澳门特别行政区',
}
def get_province(file):
    """Fetch the page at ``URL`` and write every province subtree to *file*.

    Each province-level row (a code cell whose text ends in ``0000``) becomes
    one JSON object followed by ``,\\n``. The caller is responsible for the
    enclosing ``[`` / ``]`` and for removing the final trailing comma.

    Args:
        file: Writable text stream that receives the JSON fragments.
    """
    soup = get_soup(URL)
    # `string=` is the current name of the filter formerly spelled `text=`,
    # which newer BeautifulSoup releases deprecate.
    province_cells = soup.find_all("td", class_="xl7228320", string=re.compile(r"0000$"))
    for cell in province_cells:
        code = cell.text
        # The name cell is two siblings after the code cell.
        name = cell.next_sibling.next_sibling.text
        print('write: ' + code + ' - ' + name)

        if code in special_2:  # Hong Kong, Macao, Taiwan: leaf entry only
            file.write('{"code":"' + code + '", "name":"' + name + '"},\n')
            continue

        file.write('{"code":"' + code + '","name":"' + name + '", "children":[\n')
        city_start = False
        if code in special_1:
            # These codes get a synthetic city node (code + 100) carrying the
            # same name, so the rows that follow are nested under a city.
            city_start = True
            special_code = str(int(code) + 100)
            file.write('{"code":"' + special_code + '","name":"' + name + '", "children":[\n')
        # cell.parent is the table row; get_sibling walks the rows after it.
        get_sibling(cell.parent, file, city_start)
        file.write(']},\n')

if __name__ == '__main__':
    # Script entry point: scrape once and save the result as a JSON array in
    # a file named area_mca_<YYYYMMDD>.json in the current directory.
    import io

    time_formatter = '%Y%m%d'
    time_str = datetime.datetime.now().strftime(time_formatter)
    filename = 'area_mca_' + time_str + '.json'

    # Collect the output in memory first. Seeking to `tell() - 2` in a
    # text-mode file is not supported (tell() returns an opaque cookie), and
    # where '\n' is written as '\r\n' it would leave the trailing comma in
    # place. Buffering also avoids leaving a half-written file on failure.
    buffer = io.StringIO()
    get_province(buffer)
    body = buffer.getvalue()
    # Every province entry ends with ',\n'; drop the one after the last entry.
    if body.endswith(',\n'):
        body = body[:-2]

    # Explicit UTF-8 so the Chinese names do not depend on the locale.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write('[\n' + body + '\n]')

本集总结：