本篇博客主要参考《Java 爬取网站信息和 URL 实例》一文,
以爬取全国行政区划编码为例,
仅供参考:
测试类
/**
 * Manual driver, written as a Spring-backed JUnit 4 test class, that crawls
 * administrative-division codes from m.ajinshou.com and stores them through
 * {@code OrderMapper}.
 *
 * <p>Area levels used here: 1 = country, 2 = province, 3 = city, 4 = county/district,
 * 5 = township/street. Every stage reads the rows of the previous level back through
 * the mapper, so the stages are meant to be run in order:
 * {@link #test()}, {@link #test1()}, {@link #test2()}, {@link #test3()}, {@link #test4()}.
 */
@Slf4j
@RunWith(SpringJUnit4ClassRunner.class)
@SpringBootTest
public class DiagnosisApplicationTests {

    /** URL-matching regex that is passed to {@code SpiderUtil.spiderUrl} for every page. */
    private static final String URL_REGEX = "(http|https)://[\\w+\\.?/?]+\\.[A-Za-z]+";

    /** Root of all crawled pages; also the province index page itself. */
    private static final String BASE_URL = "https://m.ajinshou.com/daima/";

    /** Value written to the createBy / updateBy columns of every inserted row. */
    private static final String OPERATOR = "sysadmin";

    /** Timestamp suffix appended to the per-province crawl file name. */
    private static final DateTimeFormatter FILE_STAMP = DateTimeFormatter.ofPattern("yyyyMMddHHmmss");

    /** Name fragments that identify the four municipalities, which are handled separately. */
    private static final String[] MUNICIPALITY_KEYWORDS = {"北京", "上海", "天津", "重庆"};

    /**
     * Rows are {municipality name, placeholder city name, placeholder city code,
     * municipality (province-level) code}. Row order is the insertion order.
     */
    private static final String[][] MUNICIPALITIES = {
        {"上海市", "上海城区", "310100", "310000"},
        {"重庆市", "重庆城区", "500100", "500000"},
        {"天津市", "天津城区", "120100", "120000"},
        {"北京市", "北京城区", "110100", "110000"},
    };

    /**
     * Rows are {fragment to look for, short name used in the site's query string}.
     * The first matching row wins.
     */
    private static final String[][] SHORT_PROVINCE_NAMES = {
        {"宁夏", "宁夏"},
        {"内蒙", "内蒙古"},
        {"广西", "广西"},
        {"西藏", "西藏"},
        {"新疆", "新疆"},
        {"香港", "香港"},
        {"澳门", "澳门"},
    };

    @Autowired
    private OrderMapper orderMapper;

    /** Stage 1: insert the root (country) row. */
    @Test
    public void test() {
        setUpCountryData();
    }

    /** Stage 2: crawl and insert province-level rows. */
    @Test
    public void test1() {
        dealProvinceData();
    }

    /** Stage 3: crawl and insert city-level rows. */
    @Test
    public void test2() {
        dealCityData();
    }

    /** Stage 4: crawl and insert county/district-level rows. */
    @Test
    public void test3() {
        dealCountyData();
    }

    /** Stage 5: crawl and insert township/street-level rows. */
    @Test
    public void test4() {
        dealStreetData();
    }

    /**
     * Inserts the root area row: code {@code 100000}, level 1, no parent.
     */
    public void setUpCountryData() {
        AreaInfo country = newArea("中华人民共和国", "100000", null, 1);
        orderMapper.batchInsertAreaInfo(Collections.singletonList(country));
    }

    /**
     * Crawls the province index page and inserts the results as level-2 rows under
     * the root area. Per the original author's note, Taiwan, Hong Kong and Macao may
     * be missing from the crawl and can be added by hand.
     */
    public void dealProvinceData() {
        AreaInfo rootAreaInfo = orderMapper.queryRootArea();
        List<AreaInfo> crawled = SpiderUtil.spiderUrl(getProvinceUrl(), URL_REGEX, "中国省直辖市自治区行政编码");
        if (crawled == null || crawled.isEmpty()) {
            return;
        }
        List<AreaInfo> provinces = dealData(rootAreaInfo, crawled, 2);
        if (!provinces.isEmpty()) {
            orderMapper.batchInsertAreaInfo(provinces);
        }
    }

    /**
     * Inserts the placeholder cities of the four municipalities, then crawls the city
     * list of every other province and inserts it as level-3 rows.
     */
    public void dealCityData() {
        dealMunicipalityCity(URL_REGEX);
        // One request per province; the municipalities yield no URL and are skipped.
        List<AreaInfo> provinces = orderMapper.queryProvinceList();
        int total = provinces.size();
        int cityCount = 0;
        String stamp = LocalDateTime.now().format(FILE_STAMP);
        for (int i = 0; i < total; i++) {
            AreaInfo provinceInfo = provinces.get(i);
            String province = provinceInfo.getAreaName();
            String url = getCityUrl(province);
            if (url != null) {
                List<AreaInfo> crawled = SpiderUtil.spiderUrl(url, URL_REGEX, province + stamp);
                List<AreaInfo> cities = dealData(provinceInfo, crawled, 3);
                if (!cities.isEmpty()) {
                    cityCount += cities.size();
                    orderMapper.batchInsertAreaInfo(cities);
                }
            }
            log.info("城市总数量===>{},查询进度===>{}", cityCount, percent(i + 1, total));
        }
    }

    /**
     * Crawls the municipalities' districts first, then the county list of every
     * level-3 city outside the municipalities, inserting all of them as level-4 rows.
     */
    public void dealCountyData() {
        List<AreaInfo> cities = orderMapper.queryAreaInfoByAreaLevel(3);
        dealMunicipalityDistrict(URL_REGEX);
        int total = cities.size();
        int countyCount = 0;
        for (int i = 0; i < total; i++) {
            AreaInfo cityInfo = cities.get(i);
            String province = cityInfo.getParentName();
            if (checkAreaName(province)) {
                String city = cityInfo.getAreaName();
                List<AreaInfo> crawled = SpiderUtil.spiderUrl(getCountyUrl(province, city), URL_REGEX, city);
                List<AreaInfo> counties = dealData(cityInfo, crawled, 4);
                if (!counties.isEmpty()) {
                    countyCount += counties.size();
                    orderMapper.batchInsertAreaInfo(counties);
                }
            }
            log.info("区县总数量===>{},查询进度===>{}", countyCount, percent(i + 1, total));
        }
    }

    /**
     * Crawls the township/street list of every level-4 county outside the
     * municipalities and inserts it as level-5 rows. A failed insert is logged
     * (with the offending batch and the exception) and the loop carries on.
     */
    public void dealStreetData() {
        List<AreaInfo> counties = orderMapper.queryAreaInfoByAreaLevel(4);
        int total = counties.size();
        int streetCount = 0;
        for (int i = 0; i < total; i++) {
            AreaInfo countyInfo = counties.get(i);
            String province = countyInfo.getProvince();
            if (checkAreaName(province)) {
                String city = countyInfo.getParentName();
                String county = countyInfo.getAreaName();
                String streetUrl = getStreetUrl(province, city, county);
                // NOTE(review): the third argument is the city name, as in the original,
                // not the county name - confirm that this is intended.
                List<AreaInfo> crawled = SpiderUtil.spiderUrl(streetUrl, URL_REGEX, city);
                List<AreaInfo> streets = dealData(countyInfo, crawled, 5);
                if (!streets.isEmpty()) {
                    streetCount += streets.size();
                    try {
                        orderMapper.batchInsertAreaInfo(streets);
                    } catch (Exception e) {
                        // Trailing throwable makes SLF4J print the stack trace as well.
                        log.error("发生错误,错误数据=={}", JSONUtil.toJsonStr(streets), e);
                    }
                }
            }
            log.info("街道总数量===>{},查询进度===>{}", streetCount, percent(i + 1, total));
        }
    }

    /**
     * Turns crawled entries into insertable rows below a parent area.
     *
     * <p>Entries whose name is longer than 10 characters are dropped (the original
     * author's note says no area name in China is longer than that). The remaining
     * entries are modified in place: level, parent code and audit columns are set.
     *
     * @param areaInfo  parent area; only its area code is read
     * @param list      crawled entries, may be {@code null} or empty
     * @param areaLevel level to assign to every kept entry
     * @return the kept entries, never {@code null}
     */
    private List<AreaInfo> dealData(AreaInfo areaInfo, List<AreaInfo> list, Integer areaLevel) {
        List<AreaInfo> resultList = new ArrayList<>();
        if (list == null || list.isEmpty()) {
            return resultList;
        }
        for (AreaInfo info : list) {
            if (info.getAreaName().length() > 10) {
                continue;
            }
            info.setAreaLevel(areaLevel);
            info.setParentCode(areaInfo.getAreaCode());
            stampAudit(info);
            resultList.add(info);
        }
        return resultList;
    }

    /**
     * Inserts one hand-made level-3 "city" row per municipality. Municipalities are
     * province-level but have no cities of their own, so a placeholder node is
     * created for each of them.
     *
     * @param regex not used; kept so the signature stays as it was
     */
    private void dealMunicipalityCity(String regex) {
        List<AreaInfo> placeholders = new ArrayList<>();
        for (String[] row : MUNICIPALITIES) {
            placeholders.add(newArea(row[1], row[2], row[3], 3));
        }
        orderMapper.batchInsertAreaInfo(placeholders);
    }

    /**
     * Crawls the district list of each municipality and inserts it as level-4 rows
     * below that municipality's placeholder city. Nothing is inserted for a
     * municipality whose crawl produced no usable entries.
     *
     * @param regex URL regex forwarded to the crawler
     */
    private void dealMunicipalityDistrict(String regex) {
        for (String[] row : MUNICIPALITIES) {
            String municipality = row[0];
            // The site expects the municipality name as both province and city.
            String url = getCountyUrl(municipality, municipality);
            List<AreaInfo> crawled = SpiderUtil.spiderUrl(url, regex, municipality);
            AreaInfo parent = new AreaInfo();
            parent.setAreaCode(row[2]);
            List<AreaInfo> districts = dealData(parent, crawled, 4);
            if (!districts.isEmpty()) {
                orderMapper.batchInsertAreaInfo(districts);
            }
        }
    }

    /**
     * Tells whether an area has to be crawled by the generic per-province logic.
     *
     * @param areaName province-level area name
     * @return {@code false} when the name contains one of the four municipality
     *         names, {@code true} otherwise
     */
    private static boolean checkAreaName(String areaName) {
        for (String keyword : MUNICIPALITY_KEYWORDS) {
            if (areaName.contains(keyword)) {
                return false;
            }
        }
        return true;
    }

    /** @return the URL of the province index page */
    public static String getProvinceUrl() {
        return BASE_URL;
    }

    /**
     * @param province province-level area name
     * @return the URL listing the cities of {@code province}, or {@code null} when
     *         the province is one of the municipalities
     */
    public static String getCityUrl(String province) {
        if (!checkAreaName(province)) {
            return null;
        }
        return BASE_URL + "daima-sheng.php?sd=" + getAreaName(province);
    }

    /**
     * @param province province-level area name
     * @param city     city name
     * @return the URL listing the counties of {@code city}
     */
    public static String getCountyUrl(String province, String city) {
        return BASE_URL + "daima-di.php?sd=" + getAreaName(province) + "&xd=" + city;
    }

    /**
     * @param province province-level area name
     * @param city     city name
     * @param county   county name
     * @return the URL listing the townships/streets of {@code county}
     */
    public static String getStreetUrl(String province, String city, String county) {
        return BASE_URL + "daima-xian.php?sd=" + getAreaName(province) + "&xd=" + city + "&cd=" + county;
    }

    /**
     * Maps a province-level name to the form used in the site's query string:
     * names containing one of the known fragments are replaced by the matching
     * short name, every other name is returned unchanged.
     */
    private static String getAreaName(String areaName) {
        for (String[] row : SHORT_PROVINCE_NAMES) {
            if (areaName.contains(row[0])) {
                return row[1];
            }
        }
        return areaName;
    }

    /** Builds a fully populated area row, audit columns included. */
    private static AreaInfo newArea(String name, String code, String parentCode, int level) {
        AreaInfo area = new AreaInfo();
        area.setAreaName(name);
        area.setAreaCode(code);
        area.setParentCode(parentCode);
        area.setAreaLevel(level);
        stampAudit(area);
        return area;
    }

    /** Sets creator/updater and gives create and update time the same instant. */
    private static void stampAudit(AreaInfo area) {
        LocalDateTime now = LocalDateTime.now();
        area.setCreateBy(OPERATOR);
        area.setUpdateBy(OPERATOR);
        area.setCreateTime(now);
        area.setUpdateTime(now);
    }

    /** Formats {@code done / total} as a percentage string such as {@code "50.0%"}. */
    private static String percent(int done, int total) {
        return (float) done * 100 / total + "%";
    }
}
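爬取网页工具方法(注意:下面给出的是通用版本,返回值为 void;上面测试类中调用的 SpiderUtil.spiderUrl 需要返回 List<AreaInfo>,请根据自己的需求在 dealFileContent 的基础上自行改造)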
/**
 * Downloads a page and writes two text files into {@code E:/spider/<today>/}:
 * {@code <filename>_content.txt} with the lines extracted by {@code dealFileContent},
 * and {@code <filename>_URL.txt} with every match of {@code regex}.
 *
 * <p>Any failure (bad URL, network error, directory or file cannot be created) is
 * printed to standard error and the method returns normally. The reader and both
 * writers are closed on every path, including when one of them could not be opened.
 *
 * @param url      address of the page to download
 * @param regex    regular expression applied to every line of the page
 * @param filename prefix of the two output files
 */
public static void spiderUrl(String url, String regex, String filename) {
    String downTime = LocalDate.now().toString();
    try {
        URL realUrl = new URL(url);
        URLConnection connection = realUrl.openConnection();
        // One output directory per day.
        String directoryName = "E:/spider/" + downTime;
        File fileDir = new File(directoryName);
        if (!fileDir.exists()) {
            boolean mkdirs = fileDir.mkdirs();
            if (!mkdirs) {
                throw new RuntimeException("创建文件失败!");
            }
        }
        String contentFileName = filename + "_content.txt";
        String urlFileName = filename + "_URL.txt";
        Pattern pattern = Pattern.compile(regex);
        // try-with-resources closes whatever was opened, in reverse order, even when a
        // later resource fails to open; no null checks are needed.
        // NOTE(review): reader and writers use the platform default charset, as before.
        try (PrintWriter contentFile =
                     new PrintWriter(new FileWriter(directoryName + "/" + contentFileName), true);
             PrintWriter urlFile =
                     new PrintWriter(new FileWriter(directoryName + "/" + urlFileName), true);
             BufferedReader br =
                     new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
            String line;
            while ((line = br.readLine()) != null) {
                // Extracted content, if this line carries any.
                String resultContent = dealFileContent(pattern, line);
                if (resultContent != null) {
                    contentFile.println(resultContent);
                }
                // Every URL found on the line.
                Matcher matcher = pattern.matcher(line);
                while (matcher.find()) {
                    urlFile.println(matcher.group());
                }
            }
        }
        System.out.println("爬取成功!");
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Custom extraction hook: pulls the text of an anchor and the text that follows it
 * out of one HTML line. Adapt it to the page being crawled.
 *
 * <p>A line is only considered when it contains both {@code "php?"} and {@code ":"}
 * and {@code pattern} matches somewhere in it. The line is then split on {@code '>'};
 * the second piece with {@code "</a"} removed and the third piece with
 * {@code "<br /"} removed are concatenated.
 *
 * @param pattern compiled URL pattern the line must contain a match for
 * @param line    one line of the downloaded page
 * @return the extracted text, or {@code null} when the line does not qualify or does
 *         not have at least three {@code '>'}-separated pieces
 */
private static String dealFileContent(Pattern pattern, String line) {
    if (!line.contains("php?") || !line.contains(":")) {
        return null;
    }
    if (!pattern.matcher(line).find()) {
        return null;
    }
    String[] pieces = line.split(">");
    // A qualifying line without the expected markup must not abort the whole crawl
    // with an ArrayIndexOutOfBoundsException.
    if (pieces.length < 3) {
        return null;
    }
    String anchorText = pieces[1].replace("</a", "");
    String trailingText = pieces[2].replace("<br /", "");
    return anchorText + trailingText;
}
网友评论