Jianshu User Activity Feed Crawler

Author: weare_b646 | Published 2019-04-24 14:54

    Step 1: Crawl Jianshu's recommended users

    Crawl Jianshu's recommended users and add them to the user information table (the first batch of users).

    package com.company;

    import org.jsoup.Connection;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;

    import java.io.IOException;

    public class meizi {

        public static void main(String[] args) {

            // The recommended-users listing is paginated; walk the first 100 pages
            for (int i = 1; i <= 100; i++) {
                Connection connect = Jsoup.connect("https://www.jianshu.com/recommendations/users?page=" + i);
                try {
                    // Fetch and parse the page into a Document
                    Document document = connect.get();

                    // Each recommended user sits in a ".wrap" card
                    Elements cards = document.select(".wrap");
                    for (Element card : cards) {
                        Element link = card.select("a").first();
                        if (link == null) {
                            continue;
                        }
                        // The profile link looks like "/users/<slug>";
                        // strip the prefix to keep just the user slug
                        System.out.println(link.attr("href").replace("/users/", ""));
                    }

                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
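
    The class above only prints the slugs; the "user information table" this step mentions is never shown. A minimal sketch of one way to persist the first batch, assuming a plain text file stands in for that table (the UserStore name and the users.txt path are illustrative, not from the original):

    package com.company;

    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Paths;
    import java.nio.file.StandardOpenOption;
    import java.util.LinkedHashSet;
    import java.util.Set;

    // Illustrative stand-in for the user information table
    public class UserStore {

        // Deduplicate slugs in memory; listing pages can repeat users
        private final Set<String> slugs = new LinkedHashSet<>();

        public void add(String slug) {
            slugs.add(slug);
        }

        // Append the collected slugs to users.txt, creating it if needed
        public void flush() throws IOException {
            Files.write(Paths.get("users.txt"), slugs,
                    StandardOpenOption.CREATE, StandardOpenOption.APPEND);
        }
    }

    Inside the loop in meizi, each extracted slug would then go through add(...), with a single flush() once the 100 pages are done.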
    
    

    Step 2: Starting from the recommended users, crawl every recommended user's full follower list and add those users to the user information table (the second batch of users).

    package com.company;

    import org.jsoup.Connection;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;

    import java.io.IOException;

    public class meizi2 {

        public static void main(String[] args) {

            // Follower lists are paginated at 9 entries per page, e.g.
            // https://www.jianshu.com/users/757b5f9f910b/followers?page=5
            Connection connect = Jsoup.connect("https://www.jianshu.com/users/5ddd9dd5ced9/followers?page=1");
            try {
                // Fetch and parse the page into a Document
                Document document = connect.get();

                // Each follower is an <li> in the ".user-list" block
                Elements followers = document.select(".user-list li");
                for (Element follower : followers) {
                    Element link = follower.select(".info a").first();
                    if (link == null) {
                        continue;
                    }
                    // abs:href resolves the relative profile link against the base URL
                    System.out.println(link.attr("abs:href"));
                }

            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
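
    As written, meizi2 fetches only page 1 for a single hardcoded user. Covering a user's full follower list, as this step describes, means repeating the request with an increasing page parameter until a page comes back empty. A sketch under that assumption, reusing the selectors above (the class and method names are illustrative):

    package com.company;

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;

    import java.io.IOException;

    public class FollowerCrawler {

        // Walk one user's follower pages (9 per page) until a page is empty
        public static void crawlAllFollowers(String slug) throws IOException {
            for (int page = 1; ; page++) {
                Document doc = Jsoup
                        .connect("https://www.jianshu.com/users/" + slug + "/followers?page=" + page)
                        .get();
                Elements followers = doc.select(".user-list li");
                if (followers.isEmpty()) {
                    break; // past the last page
                }
                for (Element follower : followers) {
                    Element link = follower.select(".info a").first();
                    if (link != null) {
                        System.out.println(link.attr("abs:href"));
                    }
                }
            }
        }

        public static void main(String[] args) throws IOException {
            crawlAllFollowers("5ddd9dd5ced9");
        }
    }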
    
    

    Step 3: Crawl each Jianshu user's complete activity feed.

    package com.company;

    import org.jsoup.Connection;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;

    import java.io.IOException;

    public class meizi3 {

        public static void getDync(int maxid, int pageno) {
            // The timeline pages with a max_id cursor rather than a plain page
            // number: each request asks for entries older than max_id. The
            // first request (maxid == 0) carries no query string.
            String query = "?max_id=" + maxid + "&page=" + pageno;
            if (maxid == 0) {
                query = "";
            }
            Connection connect = Jsoup.connect("http://www.jianshu.com/users/c4165d16d0ad/timeline" + query);
            try {
                // Fetch and parse the page into a Document
                Document document = connect.get();

                // Each activity entry is an <li> in the ".note-list" block
                Elements entries = document.select(".note-list li");
                if (entries.last() == null) {
                    // Empty page: the end of the feed has been reached
                    System.out.println(pageno);
                    return;
                }

                // The <li> ids look like "feed-123456"; the numeric part of the
                // last entry, minus one, becomes the cursor for the next page
                String id = entries.last().id();
                maxid = Integer.parseInt(id.split("-")[1]) - 1;
                System.out.println(id);

                for (Element entry : entries) {
                    // Each entry carries one span whose data-type names the activity
                    Element typeSpan = entry.select("span[data-type]").first();
                    if (typeSpan == null) {
                        continue;
                    }
                    switch (typeSpan.attr("data-type")) {
                        case "like_comment":
                            System.out.println("Liked a comment");
                            break;
                        case "comment_note":
                            System.out.println("Posted a comment");
                            // Comment text, then the slug of the commented article
                            System.out.println(entry.select(".comment").first().ownText());
                            System.out.println(entry.select("a.title").first().attr("href").replace("/p/", ""));
                            break;
                        case "like_note":
                            System.out.println("Liked an article");
                            break;
                        case "reward_note":
                            System.out.println("Rewarded an article");
                            break;
                        case "share_note":
                            System.out.println("Published an article");
                            break;
                        case "like_user":
                            System.out.println("Followed a user");
                            break;
                        case "like_collection":
                            System.out.println("Followed a collection");
                            break;
                        case "like_notebook":
                            System.out.println("Followed a notebook");
                            break;
                        default:
                            continue;
                    }
                    System.out.println(typeSpan.attr("data-datetime"));
                }

            } catch (IOException e) {
                e.printStackTrace();
                return; // don't keep recursing with a stale cursor after a failed request
            }

            getDync(maxid, ++pageno);
        }

        public static void main(String[] args) {
            getDync(0, 1);
        }
    }
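
    Two things are worth noting here. First, the timeline endpoint pages by the max_id cursor taken from the id of the last <li> on the current page, not by the page number alone. Second, getDync recurses once per page, so a very long feed could in principle overflow the stack; the recursion is tail-like and converts directly into a loop. A sketch of the iterative control flow (the class name is illustrative, and the per-entry handling is elided):

    package com.company;

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.select.Elements;

    import java.io.IOException;

    public class TimelineLoop {

        public static void main(String[] args) throws IOException {
            int maxid = 0;
            for (int pageno = 1; ; pageno++) {
                // The first request carries no query string, as in getDync
                String query = (maxid == 0) ? "" : "?max_id=" + maxid + "&page=" + pageno;
                Document document = Jsoup
                        .connect("http://www.jianshu.com/users/c4165d16d0ad/timeline" + query)
                        .get();
                Elements entries = document.select(".note-list li");
                if (entries.isEmpty()) {
                    break; // end of the feed
                }
                // Advance the cursor exactly as getDync does
                maxid = Integer.parseInt(entries.last().id().split("-")[1]) - 1;
                System.out.println(entries.last().id());
                // ... per-entry handling identical to getDync ...
            }
        }
    }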
    
    
