美文网首页
使用Java爬取网页内容

使用Java爬取网页内容

作者: 无我_无他_有你 | 来源:发表于2021-05-14 08:51 被阅读0次

本篇博客主要参考 java爬取网站信息和url实例
以爬取全国行政编码为例

仅做参考:
测试类

/**
 * One-off crawl jobs, written as JUnit tests, that fill the area table with
 * China's administrative division codes level by level.
 *
 * <p>Run the tests in order ({@code test}, {@code test1} ... {@code test4}):
 * each level reads the rows stored by the previous one to know which pages to
 * fetch next.
 */
@Slf4j
@RunWith(SpringJUnit4ClassRunner.class)
@SpringBootTest
public class DiagnosisApplicationTests {
    /** Regex handed to {@code SpiderUtil.spiderUrl} for every crawl; it matches http/https URLs. */
    private static final String URL_REGEX = "(http|https)://[\\w+\\.?/?]+\\.[A-Za-z]+";

    /** Value written to the create-by / update-by fields of every row built here. */
    private static final String OPERATOR = "sysadmin";

    private static final int LEVEL_COUNTRY = 1;
    private static final int LEVEL_PROVINCE = 2;
    private static final int LEVEL_CITY = 3;
    private static final int LEVEL_COUNTY = 4;
    private static final int LEVEL_STREET = 5;

    /**
     * The four municipalities as {province name, province code, placeholder city name,
     * placeholder city code}. The codes follow existing rows of the area table,
     * e.g. 310100 "上海城区" under 310000.
     */
    private static final String[][] MUNICIPALITIES = {
            {"上海市", "310000", "上海城区", "310100"},
            {"重庆市", "500000", "重庆城区", "500100"},
            {"天津市", "120000", "天津城区", "120100"},
            {"北京市", "110000", "北京城区", "110100"},
    };

    /** {keyword, short name}: a province name containing the keyword is sent to the site as the short name. */
    private static final String[][] PROVINCE_SHORT_NAMES = {
            {"宁夏", "宁夏"},
            {"内蒙", "内蒙古"},
            {"广西", "广西"},
            {"西藏", "西藏"},
            {"新疆", "新疆"},
            {"香港", "香港"},
            {"澳门", "澳门"},
    };

    @Autowired
    private OrderMapper orderMapper;

    /** Step 1: insert the root area (China). */
    @Test
    public void test() {
        setUpCountryData();
    }

    /** Step 2: crawl and store the provinces. */
    @Test
    public void test1() {
        dealProvinceData();
    }

    /** Step 3: crawl and store the cities. */
    @Test
    public void test2() {
        dealCityData();
    }

    /** Step 4: crawl and store the counties / districts. */
    @Test
    public void test3() {
        dealCountyData();
    }

    /** Step 5: crawl and store the towns / streets. */
    @Test
    public void test4() {
        dealStreetData();
    }

    /**
     * Inserts the root row for China (code 100000, level 1, no parent).
     *
     * @author wqf
     * @date 2021/5/12 10:38
     */
    public void setUpCountryData() {
        AreaInfo country = new AreaInfo();
        country.setAreaCode("100000");
        country.setParentCode(null);
        country.setAreaLevel(LEVEL_COUNTRY);
        country.setAreaName("中华人民共和国");
        fillAuditFields(country, LocalDateTime.now());
        orderMapper.batchInsertAreaInfo(Collections.singletonList(country));
    }

    /**
     * Crawls the province-level divisions and stores them under the root area.
     * Per the original author, Taiwan, Hong Kong and Macao may be missing from
     * the crawled page and can be added by hand.
     *
     * @author wqf
     * @date 2021/5/11 17:07
     */
    public void dealProvinceData() {
        AreaInfo rootAreaInfo = orderMapper.queryRootArea();
        if (rootAreaInfo == null) {
            // Without the root row there is no parent code to attach the provinces to.
            log.warn("未查询到根区域数据,请先执行 setUpCountryData");
            return;
        }
        List<AreaInfo> crawled = SpiderUtil.spiderUrl(getProvinceUrl(), URL_REGEX, "中国省直辖市自治区行政编码");
        insertIfNotEmpty(dealData(rootAreaInfo, crawled, LEVEL_PROVINCE));
    }

    /**
     * Inserts the placeholder cities of the four municipalities, then crawls
     * the cities of every other province and stores them.
     *
     * @author wqf
     * @date 2021/5/12 9:57
     */
    public void dealCityData() {
        dealMunicipalityCity(URL_REGEX);
        List<AreaInfo> provinces = orderMapper.queryProvinceList();
        int total = provinces.size();
        int cityCount = 0;
        // One timestamp per run, appended to the province name to form the crawler's file name.
        String time = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMddHHmmss"));
        for (int i = 0; i < total; i++) {
            AreaInfo provinceInfo = provinces.get(i);
            String province = provinceInfo.getAreaName();
            String url = getCityUrl(province);
            if (url == null) {
                // Municipality (handled above) or a row without a name.
                continue;
            }
            List<AreaInfo> crawled = SpiderUtil.spiderUrl(url, URL_REGEX, province + time);
            List<AreaInfo> cities = dealData(provinceInfo, crawled, LEVEL_CITY);
            cityCount += cities.size();
            insertIfNotEmpty(cities);
            log.info("城市总数量===>{},查询进度===>{}", cityCount, progress(i + 1, total));
        }
    }

    /**
     * Crawls the districts of the four municipalities, then the counties of
     * every stored level-3 city, and stores them.
     *
     * @author wqf
     * @date 2021/5/12 9:56
     */
    public void dealCountyData() {
        // Read the cities before the municipality districts are inserted, as the original did.
        List<AreaInfo> cities = orderMapper.queryAreaInfoByAreaLevel(LEVEL_CITY);
        dealMunicipalityDistrict(URL_REGEX);
        int total = cities.size();
        int countyCount = 0;
        for (int i = 0; i < total; i++) {
            AreaInfo cityInfo = cities.get(i);
            String province = cityInfo.getParentName();
            if (!shouldCrawl(province, cityInfo)) {
                continue;
            }
            String city = cityInfo.getAreaName();
            List<AreaInfo> crawled = SpiderUtil.spiderUrl(getCountyUrl(province, city), URL_REGEX, city);
            List<AreaInfo> counties = dealData(cityInfo, crawled, LEVEL_COUNTY);
            countyCount += counties.size();
            insertIfNotEmpty(counties);
            log.info("区县总数量===>{},查询进度===>{}", countyCount, progress(i + 1, total));
        }
    }

    /**
     * Crawls the towns / streets of every stored level-4 county outside the
     * municipalities and stores them. A failed batch insert is logged and the
     * run continues with the next county.
     *
     * @author wqf
     * @date 2021/5/12 9:56
     */
    public void dealStreetData() {
        List<AreaInfo> counties = orderMapper.queryAreaInfoByAreaLevel(LEVEL_COUNTY);
        int total = counties.size();
        int streetCount = 0;
        for (int i = 0; i < total; i++) {
            AreaInfo countyInfo = counties.get(i);
            String province = countyInfo.getProvince();
            if (!shouldCrawl(province, countyInfo)) {
                continue;
            }
            String city = countyInfo.getParentName();
            String county = countyInfo.getAreaName();
            String streetUrl = getStreetUrl(province, city, county);
            // NOTE(review): the file name is the city, so counties of one city share a name;
            // looks like a copy-paste from dealCountyData - confirm before changing.
            List<AreaInfo> crawled = SpiderUtil.spiderUrl(streetUrl, URL_REGEX, city);
            List<AreaInfo> streets = dealData(countyInfo, crawled, LEVEL_STREET);
            if (!streets.isEmpty()) {
                streetCount += streets.size();
                try {
                    orderMapper.batchInsertAreaInfo(streets);
                } catch (Exception e) {
                    // Pass the exception as the last argument so the cause and stack trace are logged.
                    log.error("发生错误,错误数据=={}", JSONUtil.toJsonStr(streets), e);
                }
            }
            log.info("街道总数量===>{},查询进度===>{}", streetCount, progress(i + 1, total));
        }
    }

    /**
     * Turns crawled rows into rows ready for insertion: sets level, parent code
     * and audit fields. Rows without a name, or with a name longer than 10
     * characters, are dropped (the original author notes that the longest
     * Chinese area name has 10 characters, so longer text is not an area name).
     *
     * @param areaInfo  parent area; its code becomes the parent code of every row
     * @param list      crawled rows, may be {@code null} or empty
     * @param areaLevel level to assign to every row
     * @return the accepted rows, never {@code null}
     * @author wqf
     * @date 2021/5/12 9:08
     */
    private List<AreaInfo> dealData(AreaInfo areaInfo, List<AreaInfo> list, Integer areaLevel) {
        List<AreaInfo> resultList = new ArrayList<>();
        if (list == null || list.isEmpty()) {
            return resultList;
        }
        LocalDateTime now = LocalDateTime.now();
        for (AreaInfo info : list) {
            String name = info.getAreaName();
            if (name == null || name.length() > 10) {
                continue;
            }
            info.setAreaLevel(areaLevel);
            info.setParentCode(areaInfo.getAreaCode());
            fillAuditFields(info, now);
            resultList.add(info);
        }
        return resultList;
    }

    /**
     * Inserts one placeholder city per municipality. Municipalities are
     * province-level but have no city level, so a node is created by hand to
     * keep the province / city / county hierarchy uniform.
     *
     * @param regex unused; kept so the signature matches {@link #dealMunicipalityDistrict(String)}
     * @author wqf
     * @date 2021/5/12 10:56
     */
    private void dealMunicipalityCity(String regex) {
        List<AreaInfo> placeholderCities = new ArrayList<>();
        LocalDateTime now = LocalDateTime.now();
        for (String[] municipality : MUNICIPALITIES) {
            AreaInfo city = new AreaInfo();
            city.setAreaName(municipality[2]);
            city.setAreaCode(municipality[3]);
            city.setParentCode(municipality[1]);
            city.setAreaLevel(LEVEL_CITY);
            fillAuditFields(city, now);
            placeholderCities.add(city);
        }
        orderMapper.batchInsertAreaInfo(placeholderCities);
    }

    /**
     * Crawls the districts of each municipality and stores them under that
     * municipality's placeholder city.
     *
     * @param regex URL regex handed to the crawler
     * @author wqf
     * @date 2021/5/12 10:56
     */
    private void dealMunicipalityDistrict(String regex) {
        for (String[] municipality : MUNICIPALITIES) {
            String name = municipality[0];
            // The site lists a municipality's districts with the municipality as both province and city.
            List<AreaInfo> crawled = SpiderUtil.spiderUrl(getCountyUrl(name, name), regex, name);
            AreaInfo placeholderCity = new AreaInfo();
            placeholderCity.setAreaCode(municipality[3]);
            // Skip empty batches, as every other call site in this class does.
            insertIfNotEmpty(dealData(placeholderCity, crawled, LEVEL_COUNTY));
        }
    }

    /** Inserts the rows in one batch; does nothing for an empty list. */
    private void insertIfNotEmpty(List<AreaInfo> rows) {
        if (!rows.isEmpty()) {
            orderMapper.batchInsertAreaInfo(rows);
        }
    }

    /** Sets creator, updater and both timestamps on a row. */
    private static void fillAuditFields(AreaInfo info, LocalDateTime now) {
        info.setCreateBy(OPERATOR);
        info.setUpdateBy(OPERATOR);
        info.setCreateTime(now);
        info.setUpdateTime(now);
    }

    /** Formats {@code done / total} as a percentage string for the progress logs. */
    private static String progress(int done, int total) {
        return (float) done * 100 / total + "%";
    }

    /**
     * Decides whether the loop should crawl below the given area: the province
     * name must be known and must not be a municipality.
     */
    private static boolean shouldCrawl(String province, AreaInfo areaInfo) {
        if (province == null) {
            log.warn("省份名称为空,跳过区域==>{}", areaInfo.getAreaName());
            return false;
        }
        return checkAreaName(province);
    }

    /**
     * Tells whether an area is crawled through the generic province pages.
     *
     * @param areaName province name, must not be {@code null}
     * @return {@code false} for Beijing, Shanghai, Tianjin and Chongqing, which are
     *         handled separately; {@code true} otherwise
     * @author wqf
     * @date 2021/5/11 16:47
     */
    private static boolean checkAreaName(String areaName) {
        for (String[] municipality : MUNICIPALITIES) {
            // First two characters of the municipality name, e.g. "北京" from "北京市".
            if (areaName.contains(municipality[0].substring(0, 2))) {
                return false;
            }
        }
        return true;
    }

    /** @return URL of the page listing all province-level divisions */
    public static String getProvinceUrl() {
        return "https://m.ajinshou.com/daima/";
    }

    /**
     * @param province province name
     * @return URL of the page listing the province's cities, or {@code null} when the
     *         name is {@code null} or belongs to a municipality
     */
    public static String getCityUrl(String province) {
        if (province == null || !checkAreaName(province)) {
            return null;
        }
        return "https://m.ajinshou.com/daima/daima-sheng.php?sd=" + getAreaName(province);
    }

    /**
     * @param province province name
     * @param city     city name
     * @return URL of the page listing the city's counties
     */
    public static String getCountyUrl(String province, String city) {
        return "https://m.ajinshou.com/daima/daima-di.php?sd=" + getAreaName(province) + "&xd=" + city;
    }

    /**
     * @param province province name
     * @param city     city name
     * @param county   county name
     * @return URL of the page listing the county's towns / streets
     */
    public static String getStreetUrl(String province, String city, String county) {
        return "https://m.ajinshou.com/daima/daima-xian.php?sd=" + getAreaName(province)
                + "&xd=" + city + "&cd=" + county;
    }

    /**
     * Maps a full province name to the short form used in the site's query
     * string (for example any name containing "内蒙" becomes "内蒙古"); other names
     * are returned unchanged.
     */
    private static String getAreaName(String areaName) {
        for (String[] mapping : PROVINCE_SHORT_NAMES) {
            if (areaName.contains(mapping[0])) {
                return mapping[1];
            }
        }
        return areaName;
    }
}

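爬取网页工具方法(注意:下面的示例版本只把爬取结果写入文件,返回值为 void;上面测试类中调用的 SpiderUtil.spiderUrl 需要在此基础上改为把解析出的数据封装成 List<AreaInfo> 并返回)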

       /**
        * Downloads a page and writes two text files under {@code E:/spider/<today>/}:
        * {@code <filename>_content.txt} with the lines extracted by
        * {@code dealFileContent}, and {@code <filename>_URL.txt} with every match of
        * {@code regex} found in the page.
        *
        * <p>Any failure is printed to stderr and the method returns normally. Both
        * files and the response stream are always closed, even when opening one of
        * them fails.
        *
        * @param url      address of the page to fetch
        * @param regex    regular expression used to pick URLs out of each line
        * @param filename prefix of the two output files
        */
       public static void spiderUrl(String url, String regex, String filename) {
        String downTime = LocalDate.now().toString();
        // One output directory per day.
        String directoryName = "E:/spider/" + downTime;
        try {
            URL realUrl = new URL(url);
            URLConnection connection = realUrl.openConnection();
            File fileDir = new File(directoryName);
            if (!fileDir.exists()) {
                boolean mkdirs = fileDir.mkdirs();
                if (!mkdirs) {
                    throw new RuntimeException("创建文件失败!");
                }
            }
            String contentFileName = filename + "_content.txt";
            String urlFileName = filename + "_URL.txt";
            Pattern pattern = Pattern.compile(regex);
            // try-with-resources closes whatever was opened, in reverse order; the old
            // finally block threw a NullPointerException when a resource was still null.
            // NOTE(review): reader and writers use the platform default charset - confirm it
            // matches the site's encoding.
            try (PrintWriter contentFile =
                         new PrintWriter(new FileWriter(directoryName + "/" + contentFileName), true);
                 PrintWriter urlFile =
                         new PrintWriter(new FileWriter(directoryName + "/" + urlFileName), true);
                 BufferedReader br =
                         new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
                String line;
                while ((line = br.readLine()) != null) {
                    // Site-specific extraction; adapt dealFileContent to your own needs.
                    String resultContent = dealFileContent(pattern, line);
                    if (resultContent != null) {
                        contentFile.println(resultContent);
                    }
                    Matcher matcher = pattern.matcher(line);
                    while (matcher.find()) {
                        urlFile.println(matcher.group());
                    }
                }
            }
            System.out.println("爬取成功!");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Site-specific extraction of one page line; adapt it to the page being crawled.
     *
     * <p>A line is used only when it contains {@code "php?"} and {@code ":"} and
     * {@code pattern} matches somewhere in it. The line is then split on
     * {@code '>'}: the second piece without {@code "</a"} (the link text) is
     * joined with the third piece without {@code "<br /"} (the text after the link).
     *
     * @param pattern compiled URL pattern the line must contain a match for
     * @param line    one line of the downloaded page
     * @return the joined text, or {@code null} when the line does not qualify or
     *         has fewer than three {@code '>'}-separated pieces
     */
    private static String dealFileContent(Pattern pattern, String line) {
        if (!line.contains("php?") || !line.contains(":")) {
            return null;
        }
        if (!pattern.matcher(line).find()) {
            return null;
        }
        String[] split = line.split(">");
        // A link broken across lines yields fewer pieces; skip it instead of throwing
        // ArrayIndexOutOfBoundsException and aborting the whole crawl.
        if (split.length < 3) {
            return null;
        }
        String linkText = split[1].replace("</a", "");
        String trailingText = split[2].replace("<br /", "");
        return linkText + trailingText;
    }

相关文章

网友评论

      本文标题:使用Java爬取网页内容

      本文链接:https://www.haomeiwen.com/subject/cvoidltx.html