一、项目框架
image.png
二、代码实现
1、数据库建表
1)、创建动态信息表
-- Timeline ("dynamic") entries scraped for each user; one row per feed item.
-- WARNING: the DROP below discards any previously crawled rows.
DROP TABLE IF EXISTS `user_dymatic_info`;
CREATE TABLE `user_dymatic_info` (
  `_id` int(11) NOT NULL AUTO_INCREMENT,
  -- Comment text; the crawler fills it only for `comment_note` entries.
  `content` longtext NOT NULL,
  -- Entry timestamp as scraped from `data-datetime`, stored as text.
  `time` varchar(60) NOT NULL DEFAULT '',
  -- Identifier (slug) of the user the entry belongs to.
  `slug` varchar(60) NOT NULL DEFAULT '',
  -- Value of the entry's `data-type` attribute, e.g. `comment_note`.
  `dymatic_type` varchar(60) NOT NULL DEFAULT '',
  -- Slug of the commented article (last path segment of its link), if any.
  `extra_content` varchar(500) NOT NULL DEFAULT '',
  PRIMARY KEY (`_id`),
  -- The crawler checks (time, slug, dymatic_type) before every insert to decide
  -- where to stop; without this index each check scans the whole table.
  KEY `idx_slug_time_type` (`slug`, `time`, `dymatic_type`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin;
2)、创建用户信息表
-- Profile counters per user, keyed by the user's slug. All counters are kept
-- as the text scraped from the profile page.
-- WARNING: the DROP below discards any previously stored users.
DROP TABLE IF EXISTS `user_information`;
CREATE TABLE `user_information` (
  `id`       VARCHAR(255) NOT NULL,  -- user slug
  `follow`   VARCHAR(255) NOT NULL,  -- number of users followed
  `follower` VARCHAR(255) NOT NULL,  -- number of followers
  `article`  VARCHAR(255) NOT NULL,  -- number of articles
  `words`    VARCHAR(255) NOT NULL,  -- total word count
  `like`     VARCHAR(255) NOT NULL,  -- likes received (`like` is reserved, keep the backticks)
  PRIMARY KEY (`id`)
);
2、jsoup爬虫以及数据写入数据库
使用maven创建java项目
1)、添加如下依赖:
<!-- HTML fetching and CSS-selector parsing for the crawler -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.3</version>
</dependency>

<!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
<!-- NOTE(review): the Jianshu class in this article does not import fastjson; confirm it is needed -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.56</version>
</dependency>

<!-- JDBC driver providing com.mysql.jdbc.Driver -->
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.47</version>
</dependency>
2)、编写爬虫以及写入数据库代码
package com.neusoft;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.sql.*;
/**
 * Crawls a Jianshu user's profile counters and timeline entries with jsoup and
 * stores them in the MySQL tables {@code user_information} and
 * {@code user_dymatic_info}.
 *
 * <p>The crawl is resumable: timeline entries are read newest-first and the
 * crawl stops at the first entry that is already present in the database.
 */
public class Jianshu {

    private static final String USERS_URL = "https://www.jianshu.com/users/";
    private static final String USER_AGENT = "Mozilla";
    /** Connect/read timeout handed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000000;

    // NOTE(review): connection settings and credentials are hard-coded for the
    // tutorial; move them to configuration before using this outside a sandbox.
    private static final String JDBC_URL = "jdbc:mysql://localhost:3306/test";
    private static final String DB_USER = "root";
    private static final String DB_PASSWORD = "root";

    private static final String SELECT_USER =
            "select * from user_information where id=?";
    private static final String UPDATE_USER =
            "UPDATE user_information SET follow=?,follower=?,article=?,words=?,`like`=? WHERE id =?";
    private static final String INSERT_USER =
            "insert into user_information(id,follow,follower,article,words,`like`) values (?,?,?,?,?,?)";
    private static final String SELECT_ENTRY =
            "select * from user_dymatic_info where time=? and slug=? and dymatic_type=?";
    private static final String INSERT_ENTRY =
            "insert into user_dymatic_info (content,time,slug,dymatic_type,extra_content) values (?,?,?,?,?)";

    /** Number of profile counters read from the page: follow, follower, article, words, like. */
    private static final int STAT_COUNT = 5;

    /**
     * Crawls one user: upserts the profile counters, then walks the timeline
     * page by page and inserts every entry not yet stored.
     *
     * @param id user identifier as used in the {@code /users/<id>/timeline} URL
     * @throws IOException            if a page cannot be fetched, or the user's slug
     *                                cannot be found on the first page
     * @throws SQLException           if a database operation fails
     * @throws ClassNotFoundException if the MySQL JDBC driver is not on the classpath
     */
    public static void jsoup(String id) throws IOException, SQLException, ClassNotFoundException {
        // The first timeline page carries the nickname link, the profile counters
        // and the first batch of entries, so it is downloaded once and reused.
        Document firstPage = fetch(USERS_URL + id + "/timeline?page=1");

        String href = firstPage.select("a.nickname").attr("href");
        String slug = href.substring(href.lastIndexOf("/") + 1);
        if (slug.isEmpty()) {
            // Without a slug every row would be stored under an empty user id.
            throw new IOException("Could not find the user's slug on the timeline page for id " + id);
        }
        System.out.println(slug);

        Class.forName("com.mysql.jdbc.Driver");
        try (Connection conn = DriverManager.getConnection(JDBC_URL, DB_USER, DB_PASSWORD)) {
            saveUserInfo(conn, slug, firstPage);
            crawlTimeline(conn, id, slug, firstPage);
        }
    }

    /** Downloads and parses one page using the crawler's user agent and timeout. */
    private static Document fetch(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent(USER_AGENT)
                .timeout(TIMEOUT_MILLIS)
                .get();
    }

    /** Reads the profile counters from the page and inserts or updates the user's row. */
    private static void saveUserInfo(Connection conn, String slug, Document profilePage) throws SQLException {
        // Counters appear in page order: follow, follower, article, words, like.
        // Missing items stay as empty strings.
        String[] stats = new String[STAT_COUNT];
        Elements items = profilePage.select("div.info li");
        for (int i = 0; i < STAT_COUNT; i++) {
            stats[i] = i < items.size() ? items.get(i).select("p").text() : "";
        }

        System.out.println(SELECT_USER);
        boolean exists;
        try (PreparedStatement query = conn.prepareStatement(SELECT_USER)) {
            query.setString(1, slug);
            try (ResultSet rs = query.executeQuery()) {
                exists = rs.next();
            }
        }

        if (exists) {
            System.out.println(UPDATE_USER);
            try (PreparedStatement update = conn.prepareStatement(UPDATE_USER)) {
                for (int i = 0; i < STAT_COUNT; i++) {
                    update.setString(i + 1, stats[i]);
                }
                // The WHERE id=? parameter must be bound too; leaving it unset
                // makes executeUpdate() fail for every existing user.
                update.setString(STAT_COUNT + 1, slug);
                update.executeUpdate();
            }
        } else {
            System.out.println(INSERT_USER);
            try (PreparedStatement insert = conn.prepareStatement(INSERT_USER)) {
                insert.setString(1, slug);
                for (int i = 0; i < STAT_COUNT; i++) {
                    insert.setString(i + 2, stats[i]);
                }
                insert.executeUpdate();
            }
        }
    }

    /**
     * Walks the timeline newest-first, inserting entries until a page is empty
     * or an entry that is already stored is reached.
     */
    private static void crawlTimeline(Connection conn, String id, String slug, Document firstPage)
            throws IOException, SQLException {
        try (PreparedStatement existsStmt = conn.prepareStatement(SELECT_ENTRY);
             PreparedStatement insertStmt = conn.prepareStatement(INSERT_ENTRY)) {
            String maxId = "";
            int page = 1;
            Document document = firstPage;

            while (true) {
                // Each <li> under the list container is one timeline entry.
                Elements entries = document.select("div#list-container li");
                if (entries.isEmpty()) {
                    return;
                }

                boolean sawFeedEntry = false;
                for (Element entry : entries) {
                    long feedId = parseFeedId(entry.attr("id"));
                    if (feedId < 0) {
                        // Not a feed item (no numeric id); nothing to store.
                        continue;
                    }
                    sawFeedEntry = true;
                    // The next page is requested with max_id = (last seen id - 1).
                    maxId = String.valueOf(feedId - 1);

                    String time = stripUtcOffset(entry.select("span").attr("data-datetime"));
                    String dymaticType = entry.select("div.info span").attr("data-type");

                    String content = "";
                    String extraContent = "";
                    if (dymaticType.equals("comment_note")) {
                        // For comments keep the comment text and the slug of the
                        // commented article (last path segment of its link).
                        content = entry.select("p.comment").text();
                        String articleHref = entry.select("a.title").attr("href");
                        extraContent = articleHref.substring(articleHref.lastIndexOf("/") + 1);
                    }

                    // Resume point: the first entry already in the database means
                    // everything older was stored by a previous run.
                    System.out.println(SELECT_ENTRY);
                    existsStmt.setString(1, time);
                    existsStmt.setString(2, slug);
                    existsStmt.setString(3, dymaticType);
                    try (ResultSet rs = existsStmt.executeQuery()) {
                        if (rs.next()) {
                            return;
                        }
                    }

                    insertStmt.setString(1, content);
                    insertStmt.setString(2, time);
                    insertStmt.setString(3, slug);
                    insertStmt.setString(4, dymaticType);
                    insertStmt.setString(5, extraContent);
                    insertStmt.executeUpdate();
                }

                if (!sawFeedEntry) {
                    // max_id could not advance; requesting more pages would not
                    // make progress.
                    return;
                }

                System.out.println("-----------------------------");
                System.out.println(maxId);

                page++;
                document = fetch(USERS_URL + id + "/timeline?max_id=" + maxId + "&page=" + page);
            }
        }
    }

    /**
     * Extracts the number after the first '-' of an element id.
     *
     * @return the parsed id, or a negative value when the text is not a
     *         non-negative number
     */
    private static long parseFeedId(String elementId) {
        String digits = elementId.substring(elementId.indexOf('-') + 1);
        try {
            // long rather than int: ids above Integer.MAX_VALUE must not abort the crawl.
            return Long.parseLong(digits);
        } catch (NumberFormatException ignored) {
            // Not a feed id; the caller skips such elements.
            return -1;
        }
    }

    /** Drops the "+hh:mm" suffix from a timestamp; returns it unchanged when there is none. */
    private static String stripUtcOffset(String dateTime) {
        int plus = dateTime.indexOf('+');
        return plus < 0 ? dateTime : dateTime.substring(0, plus);
    }

    public static void main(String[] args) throws SQLException, IOException, ClassNotFoundException {
        jsoup("d99a7dfae9e4");
    }
}
网友评论