美文网首页
使用Jsoup+HtmlUnit抓取动态网页数据

使用Jsoup+HtmlUnit抓取动态网页数据

作者: Buckler | 来源:发表于2018-01-03 18:04 被阅读447次

    最后更新日期为2018.1.3

    只为自己留个记录

    待添加功能:

    1.获取历史全部消息
    2.爬取大于10条数据
    3.自定义抓取公众号信息

    这里以通过搜狗微信搜索查找微信公众号为例!
    以搜狗微信搜索作为解析入口:http://weixin.sogou.com/weixin?type=1&s_from=input&query=[这里填公众号名称]&ie=utf8&_sug_=n&_sug_type_=
    DEMO中的完整URL为:http://weixin.sogou.com/weixin?type=1&s_from=input&query=DOTA%E6%AF%8F%E6%97%A5%E8%8A%82%E5%A5%8F&ie=utf8&_sug_=n&_sug_type_=

    package cc.buckler.test;
    
    import com.gargoylesoftware.htmlunit.BrowserVersion;
    import com.gargoylesoftware.htmlunit.WebClient;
    import com.gargoylesoftware.htmlunit.html.HtmlPage;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.junit.Test;
    
    import java.io.IOException;
    
    /**
     * Demo scraper that looks up a WeChat official account through Sogou's
     * WeChat search and prints the title, link, summary and release date of its
     * most recent articles.
     *
     * <p>jsoup fetches the static search result page; HtmlUnit loads the
     * account's article list so that its JavaScript can run before the rendered
     * markup is handed back to jsoup for querying.
     */
    public class TestData {
        /** Sogou search entry point; {@code %s} is replaced with the account name. */
        private static final String ENTRY_URL = "http://weixin.sogou.com/weixin?type=1&query=%s&ie=utf8&_sug_=n&_sug_type_=";
        /** Name of the official account to search for. */
        private static final String QUERY_WORD = "DOTA每日节奏";
        /** Host that the relative article links are prefixed with. */
        private static final String WE_CHAT_URL = "http://mp.weixin.qq.com";
        /** Number of message ids to inspect, counting down from the newest one. */
        private static final int MSG_NUM = 20;

        /**
         * Runs the scrape and prints every article found to standard output.
         *
         * <p>I/O failures are reported with a stack trace and end the run.
         *
         * @throws IllegalStateException if the article list does not expose a
         *         numeric {@code msgid} for the newest message
         */
        @Test
        public void getData() {
            String url = String.format(ENTRY_URL, QUERY_WORD);

            // try-with-resources closes the client on every exit path, including
            // unchecked exceptions thrown while parsing the page.
            try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
                webClient.getOptions().setCssEnabled(false);
                webClient.getOptions().setJavaScriptEnabled(true);
                webClient.getOptions().setRedirectEnabled(true);
                webClient.getOptions().setThrowExceptionOnScriptError(false);
                webClient.getOptions().setTimeout(50000);

                // Step 1: use jsoup to read the account link from the Sogou result page.
                Document doc = Jsoup.connect(url).get();
                String baseUrl = doc.select("p a").attr("href");
                if (baseUrl.isEmpty()) {
                    // attr() yields "" when nothing matched; fail with a clear message
                    // instead of an opaque MalformedURLException from getPage("").
                    throw new IOException("No official account link found in search results: " + url);
                }

                // Step 2: load the account's article list with HtmlUnit and give
                // background JavaScript time to finish.
                HtmlPage htmlPage = webClient.getPage(baseUrl);
                webClient.waitForBackgroundJavaScript(10000);
                doc = Jsoup.parse(htmlPage.asXml());

                // The newest message id is the starting point; the loop counts down from it.
                String lastMsgId = doc.select(".weui_media_box").attr("msgid");
                int newestMsgId;
                try {
                    newestMsgId = Integer.parseInt(lastMsgId);
                } catch (NumberFormatException e) {
                    throw new IllegalStateException(
                            "Article list has no numeric msgid (got \"" + lastMsgId + "\") at " + baseUrl, e);
                }

                // Inspect exactly MSG_NUM ids: newestMsgId down to newestMsgId - MSG_NUM + 1.
                for (int i = newestMsgId; i > newestMsgId - MSG_NUM; i--) {
                    String articleId = "#WXAPPMSG" + i;
                    String titleSelector = articleId + " h4";
                    String descSelector = articleId + " .weui_media_desc";
                    String extraInfoSelector = articleId + " .weui_media_extra_info";

                    System.out.println(articleId);
                    String title = doc.select(titleSelector).text();
                    System.out.println(title);
                    // The relative article link is read from the "hrefs" attribute (verified 2018.1.3).
                    String detailUrl = doc.select(titleSelector).attr("hrefs");
                    System.out.println(WE_CHAT_URL + detailUrl);

                    String note = doc.select(descSelector).text(); // verified 2018.1.3
                    if (note.isEmpty()) {
                        continue;
                    }
                    System.out.println(note);

                    String releaseDate = doc.select(extraInfoSelector).text(); // verified 2018.1.3
                    if (releaseDate.isEmpty()) {
                        continue;
                    }
                    System.out.println(releaseDate);
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    

    相关文章

      网友评论

          本文标题:使用Jsoup+HtmlUnit抓取动态网页数据

          本文链接:https://www.haomeiwen.com/subject/zyhknxtx.html