开发环境:
- Java语言,JDK1.8
- 开发工具IDEA
抓取过程分析
阮佬的博客就是纯html,没有什么反爬虫限制,我做的就是http请求下载到html页面,然后把里边所有的广告删除了,保留了文章主体。博客是按时间组织的。代码如下,代码注释是爬取思路。总共120行代码,很简单。
Talk is cheap, show me the code.
/**
 * Crawls every monthly archive of the blog, walking backwards from the current month.
 *
 * <p>The site organises its posts by month, so each month's archive page is visited in turn.
 * The earliest archive is December 2003: http://www.ruanyifeng.com/blog/2003/12/
 */
private void down() {
    String url = "http://www.ruanyifeng.com/blog/";
    // First month that has an archive page on the site.
    LocalDate earliest = LocalDate.of(2003, 12, 1);
    // Pin to the first day of the month so the loop condition compares months only.
    // Comparing with today's day-of-month would skip December 2003 whenever the
    // crawler runs on the 1st of a month.
    LocalDate now = LocalDate.now().withDayOfMonth(1);
    while (!now.isBefore(earliest)) {
        int year = now.getYear();
        int month = now.getMonthValue();
        // Zero-pad the month to two digits, e.g. 1 -> "01".
        String monthStr = ((month + 100) + "").substring(1, 3);
        String href = url + year + "/" + monthStr + "/";
        // Directory the month's articles are saved under, e.g. ruanyifeng/200312.
        String path = "ruanyifeng" + File.separator + year + monthStr;
        // Download every article of that month.
        downPage(href, path);
        // Step back one month.
        now = now.minusMonths(1);
    }
}
/**
 * Downloads every article linked from one monthly archive page.
 *
 * <p>Example archive page: http://www.ruanyifeng.com/blog/2003/12/. The page has two parts:
 * the month's latest article at the top and a list of the remaining articles below it, so the
 * article URLs are collected from both parts.
 *
 * @param url  URL of the monthly archive page
 * @param path storage directory: year + month, e.g. 200312
 */
private void downPage(String url, String path) {
    if (repeatUrl.contains(url)) {
        return;
    }
    LOG.info("开始下载某月份的所有文章:{}", url);
    Document document = SpiderUtil.getDocument(url);
    if (document == null) {
        // downArticle guards against a null document, so the fetch can evidently fail.
        // The URL is not recorded as visited, so a later run can retry it.
        LOG.warn("月份页面下载失败,跳过:{}", url);
        return;
    }
    List<String> urls = new ArrayList<>();
    // Top part: the month's latest article.
    Element last = document.selectFirst("div > h2 > a");
    if (last != null) {
        urls.add(last.attr("href"));
        // Bottom part: all remaining articles of the month.
        Elements elements = document.select("#alpha-inner > div > div > ul > li > a");
        for (Element link : elements) {
            urls.add(link.attr("href"));
        }
        // Create the target directory before any article is written.
        FileUtil.mkdir(System.getProperty("user.dir") + File.separator + path);
        // Download each article.
        for (String articleUrl : urls) {
            downArticle(articleUrl, path);
        }
    }
    // Remember the page so it is not crawled again.
    repeatUrl.add(url);
}
/**
 * Downloads one article page and saves its title, date and main content as an HTML file.
 *
 * <p>Pages without a title or main-content element are marked as visited and skipped.
 * A missing date element is tolerated: the file is then saved without a date prefix.
 *
 * @param url  article URL
 * @param path storage directory the file is written under
 */
private void downArticle(String url, String path) {
    if (repeatUrl.contains(url)) {
        return;
    }
    LOG.info("开始下载文章详情:{}", url);
    Document document = SpiderUtil.getDocument(url);
    if (document == null) {
        // Fetch failed; leave the URL unrecorded so it can be retried.
        return;
    }
    Element title = document.selectFirst("#page-title");
    Element content = document.selectFirst("#main-content");
    if (title == null || content == null) {
        // Not an article page in the expected layout; never retry it.
        repeatUrl.add(url);
        return;
    }
    Element date = document.selectFirst("article > div > p:nth-child(2) > a > abbr");
    // The date selector depends on the page layout and may match nothing.
    String dateHtml = date == null ? "" : date.toString();
    // Character class, not alternation: strips the 年/月/日 markers from the date text.
    String dateStr = date == null ? "" : date.text().replaceAll("[年月日]", "");
    // Image handling for the article (disabled)
    // SpiderUtil.saveImages(content, path);
    String html = "<head>\n" +
            "    <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" /></head>" +
            title.toString() + dateHtml + content.toString();
    Document doc = Jsoup.parse(html);
    // String markdown = SpiderUtil.html2markdown(html); // convert the article to markdown (disabled)
    // Strip punctuation and special symbols from the title so it can be used as a file name.
    String titleStr = title.text().replaceAll("[\\pP+~$`^=|<>~`$^+=|<>¥×]", "");
    String fileName = dateStr.isEmpty() ? titleStr : dateStr + "-" + titleStr;
    String filePath = FileUtil.getProjectDir() + path + File.separator + fileName + ".html";
    File file = FileUtil.newFile(filePath);
    LOG.info("保存文章: {},文章地址: {}", dateStr + titleStr, url);
    // Save the document to disk.
    FileUtil.writeString(doc.toString(), file, CharsetUtil.UTF_8);
    // Remember the article so it is not crawled again.
    repeatUrl.add(url);
}
项目依赖环境
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.1.3.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com</groupId>
    <artifactId>zhiyuan</artifactId>
    <version>1.0.0</version>
    <name>zhiyuan</name>
    <description>致远开发团队-项目集合</description>
    <properties>
        <java.version>1.8</java.version>
    </properties>
    <!-- Use the Aliyun Maven mirror for faster jar downloads.
         HTTPS is required: Maven 3.8.1+ blocks plain-http repositories by default. -->
    <repositories>
        <repository>
            <id>central</id>
            <name>aliyun maven</name>
            <url>https://maven.aliyun.com/repository/public</url>
            <layout>default</layout>
            <!-- Whether release artifacts may be downloaded -->
            <releases>
                <enabled>true</enabled>
            </releases>
            <!-- Whether snapshot artifacts may be downloaded -->
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </repository>
    </repositories>
    <dependencies>
        <!-- Spring Boot dependencies -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <!-- Parameter validation support -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-validation</artifactId>
        </dependency>
        <!-- AOP support -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-aop</artifactId>
        </dependency>
        <!-- Test support -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <!-- Cache support -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-cache</artifactId>
        </dependency>
        <!-- JDBC database connectivity -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-jdbc</artifactId>
        </dependency>
        <!-- MySQL driver, pinned to 5.1.x to support older MySQL servers.
             Declared once: a dependency may appear only once per POM. -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.47</version>
        </dependency>
        <!-- Alibaba Druid connection pool https://github.com/alibaba/druid/blob/master/druid-spring-boot-starter/README.md -->
        <!-- <dependency>
             <groupId>com.alibaba</groupId>
             <artifactId>druid-spring-boot-starter</artifactId>
             <version>1.1.10</version>
         </dependency>-->
        <!-- MyBatis-Plus, an enhanced MyBatis https://mybatis.plus -->
        <dependency>
            <groupId>com.baomidou</groupId>
            <artifactId>mybatis-plus-boot-starter</artifactId>
            <version>3.1.0</version>
        </dependency>
        <!-- MyBatis-Plus code generator -->
        <dependency>
            <groupId>com.baomidou</groupId>
            <artifactId>mybatis-plus-generator</artifactId>
            <version>3.1.0</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.velocity/velocity -->
        <dependency>
            <groupId>org.apache.velocity</groupId>
            <artifactId>velocity</artifactId>
            <version>1.7</version>
        </dependency>
        <!-- Hutool utility library http://hutool.cn/ -->
        <dependency>
            <groupId>cn.hutool</groupId>
            <artifactId>hutool-all</artifactId>
            <version>4.4.5</version>
        </dependency>
        <!-- Image compression https://mvnrepository.com/artifact/net.coobird.thumbnailator/thumbnailator -->
        <dependency>
            <groupId>net.coobird</groupId>
            <artifactId>thumbnailator</artifactId>
            <version>0.4.8</version>
        </dependency>
        <!-- fastjson: 1.2.15 is affected by known autotype deserialization RCEs;
             1.2.83 is the patched 1.x release. -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.83</version>
        </dependency>
        <!-- jjwt, a Java JWT implementation https://github.com/jwtk/jjwt -->
        <dependency>
            <groupId>io.jsonwebtoken</groupId>
            <artifactId>jjwt</artifactId>
            <version>0.9.1</version>
        </dependency>
        <!-- jsoup, a Java HTML parser https://jsoup.org/ -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
        <dependency>
            <groupId>org.junit.jupiter</groupId>
            <artifactId>junit-jupiter-api</artifactId>
            <version>5.3.2</version>
            <scope>test</scope>
        </dependency>
        <!-- TestNG, declared once (it was previously repeated four times). -->
        <dependency>
            <groupId>org.testng</groupId>
            <artifactId>testng</artifactId>
            <version>6.13.1</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
image
所有的博客在群文件,可以下载
317896269 点击链接加入群聊【数据爬取技术群】:点击加群
网友评论