美文网首页
Spark项目之简书百万用户动态分析与查询展示

Spark项目之简书百万用户动态分析与查询展示

作者: 小月半会飞 | 来源:发表于2019-03-19 11:21 被阅读0次

一、项目框架

image.png

二、代码实现

1、数据库建表

1)、创建动态信息表

-- Timeline entries collected by the crawler; one row per scraped timeline item.
DROP TABLE IF EXISTS `user_dymatic_info`;
CREATE TABLE `user_dymatic_info` (
  `_id`           INT(11)      NOT NULL AUTO_INCREMENT, -- surrogate row id
  `content`       LONGTEXT     NOT NULL,                -- comment text (filled for comment entries)
  `time`          VARCHAR(60)  NOT NULL DEFAULT '',     -- entry timestamp, stored as text
  `slug`          VARCHAR(60)  NOT NULL DEFAULT '',     -- id of the user the entry belongs to
  `dymatic_type`  VARCHAR(60)  NOT NULL DEFAULT '',     -- entry type, e.g. comment_note
  `extra_content` VARCHAR(500) NOT NULL DEFAULT '',     -- id of the article a comment refers to
  PRIMARY KEY (`_id`)
) ENGINE = InnoDB
  DEFAULT CHARSET = utf8mb4
  COLLATE = utf8mb4_bin;

2)、创建用户信息表

-- Per-user counters scraped from the profile header; all values are stored as text.
DROP TABLE IF EXISTS `user_information`;
CREATE TABLE `user_information` (
  `id`       VARCHAR(255) NOT NULL, -- user slug
  `follow`   VARCHAR(255) NOT NULL, -- number of users followed
  `follower` VARCHAR(255) NOT NULL, -- number of followers
  `article`  VARCHAR(255) NOT NULL, -- number of articles
  `words`    VARCHAR(255) NOT NULL, -- number of words written
  `like`     VARCHAR(255) NOT NULL, -- number of likes received
  PRIMARY KEY (`id`)
);

2、jsoup爬虫以及数据写入数据库

使用 Maven 创建 Java 项目

1)、添加如下依赖:

<!-- jsoup: HTML download and CSS-selector parsing used by the crawler -->
<dependency>
  <groupId>org.jsoup</groupId>
  <artifactId>jsoup</artifactId>
  <version>1.10.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
<dependency>
  <groupId>com.alibaba</groupId>
  <artifactId>fastjson</artifactId>
  <version>1.2.56</version>
</dependency>
<!-- MySQL JDBC driver (provides com.mysql.jdbc.Driver) -->
<dependency>
  <groupId>mysql</groupId>
  <artifactId>mysql-connector-java</artifactId>
  <version>5.1.47</version>
</dependency>

2)、编写爬虫以及写入数据库代码

package com.neusoft;


import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.sql.*;

/**
 * Crawler for the public timeline of a single Jianshu user.
 *
 * <p>{@link #jsoup(String)} downloads the user's timeline pages, writes the counters shown in
 * the profile header to {@code user_information}, and appends every timeline entry to
 * {@code user_dymatic_info}.
 */
public class Jianshu {
    /** Connection settings of the target MySQL database. */
    private static final String JDBC_URL = "jdbc:mysql://localhost:3306/test";
    private static final String DB_USER = "root";
    private static final String DB_PASSWORD = "root";

    /** Request timeout handed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000000;

    /**
     * Crawls the timeline of one user and stores the result in the database.
     *
     * <p>Crawling is incremental: pages are read from the newest entry backwards and the crawl
     * stops at the first entry that is already stored, or when a page contains no entries.
     *
     * @param id user id used in the {@code /users/<id>/timeline} URL
     * @throws IOException            if a page cannot be downloaded
     * @throws SQLException           if a database operation fails
     * @throws ClassNotFoundException if the MySQL JDBC driver is not on the classpath
     */
    public static void jsoup(String id) throws IOException, SQLException, ClassNotFoundException {
        // The first page carries both the profile header and the user's slug, so it is
        // downloaded once and shared instead of being requested a second time.
        Document firstPage = fetch("https://www.jianshu.com/users/" + id + "/timeline?page=1");

        // The slug is the last path segment of the nickname link.
        String href = firstPage.select("a.nickname").attr("href");
        String slug = href.substring(href.lastIndexOf("/") + 1);
        System.out.println(slug);

        Class.forName("com.mysql.jdbc.Driver");
        // try-with-resources guarantees the connection is released even when crawling fails.
        try (Connection conn = DriverManager.getConnection(JDBC_URL, DB_USER, DB_PASSWORD)) {
            saveUserInformation(conn, slug, firstPage);
            crawlTimeline(conn, id, slug);
        }
    }

    /** Downloads and parses one page with the crawler's user agent and timeout. */
    private static Document fetch(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent("Mozilla")
                .timeout(TIMEOUT_MILLIS)
                .get();
    }

    /** Inserts or updates the profile counters (follow, follower, article, words, like) of a user. */
    private static void saveUserInformation(Connection conn, String slug, Document profile)
            throws SQLException {
        // The first five list items of the profile header are, in order:
        // follow, follower, article, words, like. Missing items stay empty.
        String[] counters = {"", "", "", "", ""};
        Elements items = profile.select("div.info li");
        for (int i = 0; i < counters.length && i < items.size(); i++) {
            counters[i] = items.get(i).select("p").text();
        }

        boolean exists;
        String usersql = "select * from user_information where id=?";
        System.out.println(usersql);
        try (PreparedStatement query = conn.prepareStatement(usersql)) {
            query.setString(1, slug);
            try (ResultSet rs = query.executeQuery()) {
                exists = rs.next();
            }
        }

        if (exists) {
            String userupdate = "UPDATE user_information SET follow=?,follower=?,article=?,words=?,`like`=? WHERE id =?";
            System.out.println(userupdate);
            try (PreparedStatement update = conn.prepareStatement(userupdate)) {
                for (int i = 0; i < counters.length; i++) {
                    update.setString(i + 1, counters[i]);
                }
                // Bind the WHERE clause; leaving this placeholder unset makes the update fail.
                update.setString(6, slug);
                update.executeUpdate();
            }
        } else {
            String userinsert = "insert into user_information(id,follow,follower,article,words,`like`) values (?,?,?,?,?,?)";
            System.out.println(userinsert);
            try (PreparedStatement insert = conn.prepareStatement(userinsert)) {
                insert.setString(1, slug);
                for (int i = 0; i < counters.length; i++) {
                    insert.setString(i + 2, counters[i]);
                }
                insert.executeUpdate();
            }
        }
    }

    /**
     * Walks the timeline page by page and stores every entry that is not yet in the database.
     * Returns at the first entry that is already stored or at the first page without entries.
     */
    private static void crawlTimeline(Connection conn, String id, String slug)
            throws IOException, SQLException {
        String exit = "select * from user_dymatic_info where time=? and slug=? and dymatic_type=?";
        String sql = "insert into user_dymatic_info (content,time,slug,dymatic_type,extra_content) values (?,?,?,?,?)";

        // Both statements are prepared once and reused for every entry.
        try (PreparedStatement existsStmt = conn.prepareStatement(exit);
             PreparedStatement insertStmt = conn.prepareStatement(sql)) {
            String maxId = "";
            int page = 1;
            while (true) {
                String url;
                if (page == 1) {
                    url = "https://www.jianshu.com/users/" + id + "/timeline?page=1";
                } else {
                    url = "https://www.jianshu.com/users/" + id + "/timeline?max_id=" + maxId + "&page=" + page;
                }
                Elements entries = fetch(url).select("div#list-container li");
                if (entries.isEmpty()) {
                    return;
                }

                for (Element entry : entries) {
                    // The element id ends in a number; one less than it is the max_id that
                    // requests the following page. long avoids overflow on large ids.
                    String index = entry.attr("id");
                    maxId = Long.parseLong(index.substring(index.indexOf("-") + 1)) - 1 + "";

                    // Drop the UTC offset ("+08:00") when present; keep the value as is otherwise.
                    String rawTime = entry.select("span").attr("data-datetime");
                    int offsetStart = rawTime.indexOf("+");
                    String time = offsetStart >= 0 ? rawTime.substring(0, offsetStart) : rawTime;

                    String dymaticType = entry.select("div.info span").attr("data-type");

                    // Only comment entries carry a text and a reference to the commented article.
                    String content = "";
                    String extraContent = "";
                    if (dymaticType.equals("comment_note")) {
                        content = entry.select("p.comment").text();
                        String articleHref = entry.select("a.title").attr("href");
                        extraContent = articleHref.substring(articleHref.lastIndexOf("/") + 1);
                    }

                    // Incremental crawl: an entry that is already stored marks the point
                    // reached by a previous run, so everything older is skipped.
                    System.out.println(exit);
                    existsStmt.setString(1, time);
                    existsStmt.setString(2, slug);
                    existsStmt.setString(3, dymaticType);
                    try (ResultSet found = existsStmt.executeQuery()) {
                        if (found.next()) {
                            return;
                        }
                    }

                    insertStmt.setString(1, content);
                    insertStmt.setString(2, time);
                    insertStmt.setString(3, slug);
                    insertStmt.setString(4, dymaticType);
                    insertStmt.setString(5, extraContent);
                    insertStmt.executeUpdate();
                }
                System.out.println("-----------------------------");
                System.out.println(maxId);
                page++;
            }
        }
    }

    /** Runs the crawler for a sample user id. */
    public static void main(String[] args) throws SQLException, IOException, ClassNotFoundException {
        jsoup("d99a7dfae9e4");
    }
}

相关文章

网友评论

      本文标题:Spark项目之简书百万用户动态分析与查询展示

      本文链接:https://www.haomeiwen.com/subject/aiqgmqtx.html