美文网首页
从国家统计局爬取省市区信息

从国家统计局爬取省市区信息

作者: ouyangan | 来源:发表于2017-08-03 15:53 被阅读294次

Location

    // Row identifier. The crawler reads it back via getId() right after insert to link
    // child rows, so it is presumably generated on insert — TODO confirm the id strategy.
    private Long id;
    // Administrative division code, stored numerically (the crawler parses a 6-character key).
    private Long code;
    // Display name of the province / city / district.
    private String name;
    // Depth in the hierarchy as written by the crawler: 1 = province, 2 = city, 3 = district.
    private Integer level;
    // Timestamp set by the crawler when the row is built; mapped to column "create_time".
    @TableField("create_time")
    private Date createTime;
    // id of the parent Location; the crawler writes 0 for provinces. Mapped to column "parent_id".
    @TableField("parent_id")
    private Long parentId;

HttpUtil

/**
 * Shared HTTP client. Each OkHttpClient owns its own connection pool and thread pools,
 * so one instance is reused for every request instead of building a new one per call.
 */
private static final OkHttpClient HTTP_CLIENT = new OkHttpClient.Builder()
        .connectTimeout(10, TimeUnit.SECONDS)
        .writeTimeout(10, TimeUnit.SECONDS)
        .readTimeout(10, TimeUnit.SECONDS)
        .build();

/**
 * Performs a blocking HTTP GET and returns the response body as a string.
 *
 * @param url absolute URL to fetch
 * @return the response body decoded to a string
 * @throws IOException if the request fails, times out (10 s connect/read/write),
 *                     or the server answers with a non-2xx status
 */
public static String get(String url) throws IOException {
        Request request = new Request.Builder()
                .url(url)
                .build();
        // try-with-resources guarantees the response (and its connection) is released
        // on every path, including the early throw below.
        try (Response response = HTTP_CLIENT.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                // Fail loudly rather than handing an error page to the HTML parser.
                throw new IOException("Unexpected HTTP status " + response.code() + " for " + url);
            }
            String string = response.body().string();
            log.debug(string);
            return string;
        }
    }

爬虫代码

  /**
   * Crawls the administrative-division code list linked from
   * {@code http://www.stats.gov.cn/tjsj/tjbz/xzqhdm} and stores it as a three-level tree:
   * codes ending in "0000" become level 1, other codes ending in "00" become level 2,
   * and everything else becomes level 3.
   *
   * <p>Only the first link found on the index page is fetched. A level-2 row is attached to the
   * level-1 row sharing its first two characters; a level-3 row is attached to the level-2 row
   * sharing its first four characters. Rows without a matching parent are not inserted.
   *
   * @throws IOException           if either page cannot be downloaded
   * @throws IllegalStateException if the index page contains no usable links
   */
  @Test
    public void locationSpider() throws IOException {
        String url = "http://www.stats.gov.cn/tjsj/tjbz/xzqhdm";
        Document index = Jsoup.parse(HttpUtil.get(url));
        List<String> urls = new ArrayList<>();
        for (Element container : index.getElementsByClass("center_list_contlist")) {
            for (Element link : container.select("a")) {
                String href = link.attr("href");
                if (href.isEmpty()) {
                    // Anchor without a target; substring(1) below would throw.
                    continue;
                }
                // The first character of the href is dropped before appending to the base URL
                // (presumably the '.' of a "./..." relative link — verify against the page).
                urls.add(url + href.substring(1));
            }
        }
        if (urls.isEmpty()) {
            throw new IllegalStateException("No area-code list links found at " + url);
        }

        Document areaPage = Jsoup.parse(HttpUtil.get(urls.get(0)));
        // TreeMap keeps codes sorted so the printed and inserted order is stable.
        Map<String, String> map = new TreeMap<>();
        for (Element paragraph : areaPage.getElementsByClass("MsoNormal")) {
            // Strip ASCII whitespace plus the non-breaking and ideographic spaces,
            // which "\\s" alone does not match.
            String s = paragraph.text().replaceAll("[\\s\\u00A0\\u3000]+", "");
            if (!startsWithAreaCode(s)) {
                // Blank or heading paragraphs: no 6-digit code to split off.
                continue;
            }
            map.put(s.substring(0, 6), s.substring(6));
        }

        Map<String, String> province = new TreeMap<>();
        Map<String, String> city = new TreeMap<>();
        Map<String, String> district = new TreeMap<>();
        for (Map.Entry<String, String> entry : map.entrySet()) {
            String key = entry.getKey();
            if (key.endsWith("0000")) {
                province.put(key, entry.getValue());
            } else if (key.endsWith("00")) {
                city.put(key, entry.getValue());
            } else {
                district.put(key, entry.getValue());
            }
        }

        printSection("===========省份===========", province);
        System.out.println(province.size());
        printSection("===========市===========", city);
        printSection("===========区===========", district);

        for (Map.Entry<String, String> p : province.entrySet()) {
            Location provinceRow = saveLocation(p.getKey(), p.getValue(), 1, 0L);
            String provincePrefix = p.getKey().substring(0, 2);
            for (Map.Entry<String, String> c : city.entrySet()) {
                if (!c.getKey().startsWith(provincePrefix)) {
                    continue;
                }
                // getId() is read after insert; assumes the mapper fills in the generated id.
                Location cityRow = saveLocation(c.getKey(), c.getValue(), 2, provinceRow.getId());
                String cityPrefix = c.getKey().substring(0, 4);
                for (Map.Entry<String, String> d : district.entrySet()) {
                    if (d.getKey().startsWith(cityPrefix)) {
                        saveLocation(d.getKey(), d.getValue(), 3, cityRow.getId());
                    }
                }
            }
        }
    }

    /** Returns true when {@code s} begins with six ASCII digits. */
    private static boolean startsWithAreaCode(String s) {
        if (s.length() < 6) {
            return false;
        }
        for (int i = 0; i < 6; i++) {
            char ch = s.charAt(i);
            if (ch < '0' || ch > '9') {
                return false;
            }
        }
        return true;
    }

    /** Prints a section title followed by one "code->name" line per entry. */
    private static void printSection(String title, Map<String, String> entries) {
        System.out.println(title);
        for (Map.Entry<String, String> entry : entries.entrySet()) {
            System.out.println(entry.getKey() + "->" + entry.getValue());
        }
    }

    /** Builds a Location from the given values, inserts it via locationMapper, and returns it. */
    private Location saveLocation(String code, String name, Integer level, Long parentId) {
        Location location = Location.builder()
                .code(Long.valueOf(code))
                .name(name)
                .createTime(new Date())
                .level(level)
                .parentId(parentId)
                .build();
        locationMapper.insert(location);
        return location;
    }
image.png

相关文章

网友评论

      本文标题:从国家统计局爬取省市区信息

      本文链接:https://www.haomeiwen.com/subject/abuplxtx.html