美文网首页
HtmlUnit2.14使用样例—获取动态网页

HtmlUnit2.14使用样例—获取动态网页

作者: 陈煦缘 | 来源:发表于2018-04-26 16:28 被阅读0次

    标签: 简书笔记

    转自:http://shihlei.iteye.com/blog/2067707

    一、概述

    HttpClient适合处理静态资源,网络爬虫等类似应用很大程度需要处理动态网页(内容由js填充,如百度图片,body里基本没有数据,碰到最麻烦的是新浪微博列表页)。将网页下载后,结合JS和Dom模型还原网页,我目前还未攻破,但在下载层还原网页,HtmlUnit是一种解决方案,虽然对JS的支持还是不完美。
    HtmlUnit其实是自动化测试工具,集成了下载(HttpClient),Dom(NekoHtml),驱动JS(Rhino)。有一定的网页渲染能力,由于会驱动Dom,会消耗些CPU,内存。
    本文描述HTMLUnit请求响应,设置cookies,设置代理,驱动JS等方法。

    二、版本

    pom.xml 依赖

    <!-- HtmlUnit: the library every example in this article is written against (version 2.14). -->
    <dependency>
                <groupId>net.sourceforge.htmlunit</groupId>
                <artifactId>htmlunit</artifactId>
                <version>2.14</version>
            </dependency>
            
            <!-- xml-apis, pinned explicitly to 1.4.01. -->
            <dependency>
                <groupId>xml-apis</groupId>
                <artifactId>xml-apis</artifactId>
                <version>1.4.01</version>
            </dependency>
            
            <!-- NOTE(review): none of the code shown in this article imports Selenium classes;
                 confirm whether this dependency is actually required. -->
            <dependency>
                <groupId>org.seleniumhq.selenium</groupId>
                <artifactId>selenium-java</artifactId>
                <version>3.11.0</version>
            </dependency>
    
    

    三、典型功能

    1、打开百度

    
    /**
     * Loads a search home page, types a query into its form, submits it and prints the
     * resulting page as XML.
     *
     * <p>NOTE(review): the URL targets Baidu, but the form/field names used here
     * ({@code f}, {@code q}, {@code btnG}) are the same ones the complete example uses
     * against Google; confirm they exist on the page being loaded.
     *
     * @param args unused
     * @throws Exception if the page cannot be loaded or an expected element is missing
     */
    public static void main(String[] args) throws Exception {
        String url = "http://www.baidu.com";

        final WebClient webClient = new WebClient();
        try {
            HtmlPage htmlPage = webClient.getPage(url);

            // HtmlUnit DOM model:
            // look up the <form> whose name attribute is "f"
            HtmlForm form = htmlPage.getFormByName("f");
            // look up the <input> whose name attribute is "q"
            HtmlTextInput text = form.getInputByName("q");
            // type the search term
            text.setText("baidu");
            // look up the submit button
            HtmlSubmitInput button = form.getInputByName("btnG");
            // submit the form; click() returns the page that results from the submission
            HtmlPage listPage = button.click();

            System.out.println(listPage.asXml());
        } finally {
            // Release the client's windows even when a lookup or request above throws.
            webClient.closeAllWindows();
        }
    }
       
    
    

    2、获取动态页面

    /**
     * Fetches a Baidu image-search page and prints its content after JavaScript has run.
     *
     * @throws Exception if the page cannot be loaded
     */
    public void demo2() throws Exception {
        String url = "http://image.baidu.com/i?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1400328281672_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=html";

        final WebClient webClient = new WebClient();
        try {
            // 1. enable JavaScript
            webClient.getOptions().setJavaScriptEnabled(true);
            // 2. disable CSS so no follow-up stylesheet requests are made for rendering
            webClient.getOptions().setCssEnabled(false);
            // 3. follow redirects
            webClient.getOptions().setRedirectEnabled(true);
            // 4. do not throw when a page script fails
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            // 5. timeout, in milliseconds
            webClient.getOptions().setTimeout(50000);

            HtmlPage htmlPage = webClient.getPage(url);
            // Give background JavaScript up to 10 s to finish updating the DOM.
            webClient.waitForBackgroundJavaScript(10000);
            // page content after scripts have run
            System.out.println(htmlPage.asXml());
        } finally {
            // Close the client even when loading the page fails.
            webClient.closeAllWindows();
        }
    }
    
    

    四、Demo

    1、请求响应

    /**
     * Sends a GET request using a short-lived {@code WebClient}.
     *
     * @param url absolute URL to fetch
     * @return whatever {@code sendRequest} returns for the response body
     * @throws Exception if the URL is malformed or the request fails
     */
    public static byte[] sendGetRequest(String url) throws Exception{
        WebClient webClient = new WebClient();
        try {
            WebRequest webRequest = new WebRequest(new URL(url));
            webRequest.setHttpMethod(HttpMethod.GET);
            return sendRequest(webClient, webRequest);
        } finally {
            // The client is created per call, so it must be closed per call as well.
            webClient.closeAllWindows();
        }
    }
      
    /**
     * Sends a POST request using a short-lived {@code WebClient}.
     *
     * @param url    absolute URL to post to
     * @param params form parameters to send; may be {@code null} or empty
     * @return whatever {@code sendRequest} returns for the response body
     * @throws Exception if the URL is malformed or the request fails
     */
    public static byte[] sendPostRequest(String url,Map<String,String> params) throws Exception{
        WebClient webClient = new WebClient();
        try {
            WebRequest webRequest = new WebRequest(new URL(url));
            webRequest.setHttpMethod(HttpMethod.POST);
            if (params != null && !params.isEmpty()) {
                // Build our own list and hand it over with setRequestParameters(...) instead of
                // appending to the list returned by getRequestParameters(), which is not
                // guaranteed to be modifiable.
                List<NameValuePair> pairs = new ArrayList<NameValuePair>(params.size());
                for (Entry<String, String> param : params.entrySet()) {
                    pairs.add(new NameValuePair(param.getKey(), param.getValue()));
                }
                webRequest.setRequestParameters(pairs);
            }
            return sendRequest(webClient, webRequest);
        } finally {
            // The client is created per call, so it must be closed per call as well.
            webClient.closeAllWindows();
        }
    }
      
    //底层请求  
    private static byte[] sendRequest(WebClient webClient,WebRequest webRequest) throws Exception{  
        byte[] responseContent = null;  
        Page page = webClient.getPage(webRequest);  
          
        WebResponse webResponse = page.getWebResponse();  
          
        int status = webResponse.getStatusCode();  
          
        System.out.println("Charset : " + webResponse.getContentCharset());  
      
        System.out.println("ContentType : " + webResponse.getContentType());  
      
        // 读取数据内容  
        if (status==200) {  
            if (page.isHtmlPage()) {  
                <strong>// 等待JS执行完成,包括远程JS文件请求,Dom处理  
                 webClient.waitForBackgroundJavaScript(10000);</strong>  
    <strong>                     // 使用JS还原网页  
                 responseContent = ((HtmlPage) page).asXml().getBytes();</strong>  
            } else {  
                InputStream bodyStream = webResponse.getContentAsStream();  
                responseContent = ByteStreams.toByteArray(bodyStream);  
                bodyStream.close();  
            }  
        }  
        // 关闭响应流  
        webResponse.cleanUp();  
      
        return responseContent;  
    }  
    

    2、配置JS,CSS,超时,重定向

    
    /**
     * Applies the crawler's standard option set to the given client.
     *
     * @param webClient client whose options are adjusted in place
     */
    private void configWebClient(WebClient webClient) {
        // Request timeout, taken from the surrounding class's timeout setting.
        webClient.getOptions().setTimeout(timeout);
        // A failing page script must not abort the page load with an exception.
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        // Follow redirects.
        webClient.getOptions().setRedirectEnabled(true);
        // CSS off: avoids the extra stylesheet requests made for rendering.
        webClient.getOptions().setCssEnabled(false);
        // JavaScript on: needed so script-filled pages are populated.
        webClient.getOptions().setJavaScriptEnabled(true);
    }
    
    

    3、代理

    
      /**
       * Routes the client's traffic through the given proxy and, when the proxy carries a
       * user name, registers its credentials with the client.
       *
       * @param webClient client to configure
       * @param proxy     proxy host, port and optional user/password
       */
      private void setProxy(WebClient webClient,HttpProxy proxy) {
        ProxyConfig proxyConfig = webClient.getOptions().getProxyConfig();
        proxyConfig.setProxyHost(proxy.getHost());
        proxyConfig.setProxyPort(proxy.getPort());

        // HttpProxy leaves user/password null unless they are set explicitly; an
        // unauthenticated proxy must not register a null user name as credentials.
        if (proxy.getUser() != null) {
            DefaultCredentialsProvider credentialsProvider = (DefaultCredentialsProvider) webClient
                    .getCredentialsProvider();
            credentialsProvider.addCredentials(proxy.getUser(), proxy.getPassword());
        }
    }
    
    

    4、辅助类

    
    /**
     * Mutable value holder describing a proxy server and the optional account used to
     * authenticate against it. Every property is a plain read/write bean property.
     */
    public class HttpProxy {

        /** Proxy descriptor; starts out as "http" (presumably the proxy protocol). */
        private String proxy = "http";

        /** Proxy host name or address; {@code null} until set. */
        private String host;

        /** Proxy port; {@code 0} until set. */
        private int port;

        /** Account name for the proxy; {@code null} until set. */
        private String user;

        /** Account password for the proxy; {@code null} until set. */
        private String password;

        // ---- readers ----

        /** @return the proxy descriptor */
        public String getProxy() {
            return this.proxy;
        }

        /** @return the proxy host */
        public String getHost() {
            return this.host;
        }

        /** @return the proxy port */
        public int getPort() {
            return this.port;
        }

        /** @return the proxy account name */
        public String getUser() {
            return this.user;
        }

        /** @return the proxy account password */
        public String getPassword() {
            return this.password;
        }

        // ---- writers ----

        /** @param proxy the new proxy descriptor */
        public void setProxy(String proxy) {
            this.proxy = proxy;
        }

        /** @param host the new proxy host */
        public void setHost(String host) {
            this.host = host;
        }

        /** @param port the new proxy port */
        public void setPort(int port) {
            this.port = port;
        }

        /** @param user the new proxy account name */
        public void setUser(String user) {
            this.user = user;
        }

        /** @param password the new proxy account password */
        public void setPassword(String password) {
            this.password = password;
        }
    }
    

    五、Cookies:可以用于认证数据设置

    1)设置Cookies

    
    /**
     * Adds the given name/value pairs as cookies for {@code domain} on the client.
     * Does nothing when {@code cookies} is {@code null} or empty.
     *
     * @param webClient client whose cookie manager receives the cookies
     * @param domain    domain the cookies are created for
     * @param cookies   cookie names mapped to their values
     */
    private void setCookies(WebClient webClient,String domain, Map<String, String> cookies) {
        if (cookies == null || cookies.isEmpty()) {
            return;
        }
        // Turn cookie handling on before adding anything.
        webClient.getCookieManager().setCookiesEnabled(true);
        for (Entry<String, String> pair : cookies.entrySet()) {
            webClient.getCookieManager().addCookie(new Cookie(domain, pair.getKey(), pair.getValue()));
        }
    }
    
    
    

    2)获取响应Cookies

    
    /**
     * Copies the cookies currently held by the client's cookie manager into a name-to-value map.
     * When two cookies share a name, the one iterated last overwrites the earlier one.
     *
     * @param webClient client whose cookies are read
     * @return a new map of cookie names to cookie values
     */
    private Map<String, String> getResponseCookies(WebClient webClient) {
        Map<String, String> byName = Maps.newHashMap();
        Set<Cookie> held = webClient.getCookieManager().getCookies();
        for (Cookie each : held) {
            byName.put(each.getName(), each.getValue());
        }
        return byName;
    }
    
    

    3)删除所有Cookies

    
    /** 
     * Removes every cookie held by the given client's cookie manager.
     *
     * @param webClient the client whose cookies are cleared
     */  
    public void clearCookies(WebClient webClient) {  
        webClient.getCookieManager().clearCookies();  
    }  
    
    

    六、驱动JS

    可实现自动化流程,如驱动表单提交,获取表单提交后的页面
    如登录后页面:

    
    /**
     * Fills in and submits a login form by running JavaScript inside the page.
     *
     * <p>The script writes the account name and password into the second element named
     * {@code username} / {@code password} and clicks the second element with class
     * {@code W_btn_g}. Pages that are not HTML pages are ignored.
     *
     * @param page the page to drive; only acted on when it is an {@code HtmlPage}
     */
    public void doWeb(Page page) {
        if (!(page instanceof HtmlPage)) {
            return;
        }
        StringBuilder js = new StringBuilder();
        // The values end up inside single-quoted JS string literals, so they must be escaped:
        // a quote or backslash in the password would otherwise break (or alter) the script.
        js.append("document.getElementsByName('username')[1].value='")
                .append(escapeForJsString(WeiboAccount.USERNAME))
                .append("';");
        js.append("document.getElementsByName('password')[1].value='")
                .append(escapeForJsString(WeiboAccount.PASSWORD))
                .append("';");
        js.append("document.getElementsByClassName('W_btn_g')[1].click();");
        HtmlPage htmlPage = (HtmlPage) page;
        htmlPage.executeJavaScript(js.toString());
    }

    /** Escapes a value so it can be embedded inside a single-quoted JavaScript string literal. */
    private static String escapeForJsString(Object value) {
        // Backslash first, so the escapes added afterwards are not escaped again.
        return String.valueOf(value)
                .replace("\\", "\\\\")
                .replace("'", "\\'")
                .replace("\r", "\\r")
                .replace("\n", "\\n");
    }
    
    

    七、附录:完整代码

    import com.gargoylesoftware.htmlunit.*;
    import com.gargoylesoftware.htmlunit.html.HtmlForm;
    import com.gargoylesoftware.htmlunit.html.HtmlPage;
    import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
    import com.gargoylesoftware.htmlunit.html.HtmlTextInput;
    import com.gargoylesoftware.htmlunit.util.Cookie;
    import com.gargoylesoftware.htmlunit.util.NameValuePair;
    import com.google.common.collect.Maps;
    import com.google.common.io.ByteStreams;
    import com.jdd.basedata.commons.bean.HttpProxy;
    
    import java.io.IOException;
    import java.io.InputStream;
    import java.net.URL;
    import java.nio.charset.StandardCharsets;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;
    import java.util.Map.Entry;
    import java.util.Set;
    
    
    /**
     * Thin wrapper around one HtmlUnit {@code WebClient}: sends GET/POST requests, returns the
     * (JavaScript-processed) body as bytes, and offers helpers for proxy and cookie handling.
     *
     * <p>One instance owns one client; call {@link #shutdown()} when done with it.
     */
    public class HtmlUnitDemo {

        /** Upper bound, in milliseconds, to wait for background JavaScript after an HTML page loads. */
        private static final long JS_WAIT_MILLIS = 100000;

        /** The client shared by all requests made through this instance. */
        private final WebClient webClient;

        /** Request timeout in milliseconds, applied in {@link #configWebClient()}. */
        private final int timeout = 50000;

        /** Creates an instance that connects directly, without a proxy. */
        public HtmlUnitDemo() {
            this(null);
        }

        /**
         * Creates an instance with the standard option set applied.
         *
         * @param proxy proxy to route requests through, or {@code null} for a direct connection
         */
        public HtmlUnitDemo(HttpProxy proxy) {
            webClient = new WebClient();
            configWebClient();
            if (proxy != null) {
                setProxy(proxy);
            }
        }

        /**
         * Sends a GET request.
         *
         * @param url absolute URL to fetch
         * @return the response body, or {@code null} when the status code is not 200
         * @throws Exception if the URL is malformed or the request fails
         */
        public byte[] sendGetRequest(String url) throws Exception {
            WebRequest webRequest = new WebRequest(new URL(url));
            webRequest.setHttpMethod(HttpMethod.GET);
            return sendRequest(webRequest);
        }

        /**
         * Sends a POST request.
         *
         * @param url    absolute URL to post to
         * @param params form parameters to send; may be {@code null} or empty
         * @return the response body, or {@code null} when the status code is not 200
         * @throws Exception if the URL is malformed or the request fails
         */
        public byte[] sendPostRequest(String url, Map<String, String> params) throws Exception {
            WebRequest webRequest = new WebRequest(new URL(url));
            webRequest.setHttpMethod(HttpMethod.POST);
            if (params != null && !params.isEmpty()) {
                // Build our own list and hand it over with setRequestParameters(...) instead of
                // appending to the list returned by getRequestParameters(), which is not
                // guaranteed to be modifiable.
                List<NameValuePair> pairs = new ArrayList<NameValuePair>(params.size());
                for (Entry<String, String> param : params.entrySet()) {
                    pairs.add(new NameValuePair(param.getKey(), param.getValue()));
                }
                webRequest.setRequestParameters(pairs);
            }
            return sendRequest(webRequest);
        }

        /**
         * Executes the request and returns the response body as bytes.
         *
         * <p>For HTML pages the body is the page serialized by {@code asXml()} after waiting for
         * background JavaScript, encoded as UTF-8 (the charset {@link #main} decodes with). For
         * any other content the raw response bytes are returned.
         *
         * @param webRequest request to execute
         * @return the body bytes, or {@code null} when the status code is not 200
         * @throws Exception if the request or reading the body fails
         */
        private byte[] sendRequest(WebRequest webRequest) throws Exception {
            Page page = webClient.getPage(webRequest);
            WebResponse webResponse = page.getWebResponse();
            try {
                System.out.println("Charset : " + webResponse.getContentCharset());

                System.out.println("ContentType : " + webResponse.getContentType());

                if (webResponse.getStatusCode() != 200) {
                    return null;
                }

                if (page.isHtmlPage()) {
                    // Let page scripts finish before serializing the DOM.
                    webClient.waitForBackgroundJavaScript(JS_WAIT_MILLIS);
                    return ((HtmlPage) page).asXml().getBytes(StandardCharsets.UTF_8);
                }

                // try-with-resources closes the stream even if reading fails part-way.
                try (InputStream bodyStream = webResponse.getContentAsStream()) {
                    return ByteStreams.toByteArray(bodyStream);
                }
            } finally {
                // Release the response's resources on every path, not only on success.
                webResponse.cleanUp();
            }
        }

        /** Applies the standard option set to this instance's client. */
        private void configWebClient() {
            // 1. enable JavaScript
            webClient.getOptions().setJavaScriptEnabled(true);
            // 2. disable CSS so no follow-up stylesheet requests are made for rendering
            webClient.getOptions().setCssEnabled(false);
            // 3. follow redirects
            webClient.getOptions().setRedirectEnabled(true);
            // 4. do not throw when a page script fails
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            // 5. request timeout
            webClient.getOptions().setTimeout(timeout);
        }

        /**
         * Routes the client's traffic through the given proxy and, when the proxy carries a
         * user name, registers its credentials with the client.
         *
         * @param proxy proxy host, port and optional user/password
         */
        private void setProxy(HttpProxy proxy) {
            ProxyConfig proxyConfig = webClient.getOptions().getProxyConfig();
            proxyConfig.setProxyHost(proxy.getHost());
            proxyConfig.setProxyPort(proxy.getPort());

            // An unauthenticated proxy has no user; do not register a null user name.
            if (proxy.getUser() != null) {
                DefaultCredentialsProvider credentialsProvider = (DefaultCredentialsProvider) webClient
                        .getCredentialsProvider();
                credentialsProvider.addCredentials(proxy.getUser(), proxy.getPassword());
            }
        }

        /**
         * Copies the cookies currently held by the client into a name-to-value map.
         *
         * @return a new map of cookie names to cookie values
         */
        @SuppressWarnings("unused")
        private Map<String, String> getResponseCookies() {
            Set<Cookie> cookies = webClient.getCookieManager().getCookies();
            Map<String, String> responseCookies = Maps.newHashMap();
            for (Cookie c : cookies) {
                responseCookies.put(c.getName(), c.getValue());
            }
            return responseCookies;
        }

        /**
         * Adds the given name/value pairs as cookies for {@code domain} and logs each one.
         * Does nothing when {@code cookies} is {@code null} or empty.
         *
         * @param domain  domain the cookies are created for
         * @param cookies cookie names mapped to their values
         */
        @SuppressWarnings("unused")
        private void setCookies(String domain, Map<String, String> cookies) {
            if (cookies != null && cookies.size() > 0) {
                // Turn cookie handling on before adding anything.
                webClient.getCookieManager().setCookiesEnabled(true);
                for (Entry<String, String> c : cookies.entrySet()) {
                    Cookie cookie = new Cookie(domain, c.getKey(), c.getValue());
                    webClient.getCookieManager().addCookie(cookie);

                    System.out.println("Set Cookies : " + c.getKey() + " | " + c.getValue());
                }
            }
        }

        /** Removes every cookie held by this instance's client. */
        public void clearCookies() {
            webClient.getCookieManager().clearCookies();
        }

        /**
         * Closes all windows of this instance's client.
         *
         * @throws IOException declared for callers; not thrown by the code in this method
         */
        public void shutdown() throws IOException {
            webClient.closeAllWindows();
        }

        /**
         * Opens Google, searches for "baidu" through the page's form and prints the result
         * page as XML. Uses its own short-lived client rather than this instance's client.
         *
         * @throws Exception if the page cannot be loaded or an expected element is missing
         */
        public void demo() throws Exception{
            String url = "http://www.google.com.hk";

            final WebClient client = new WebClient();
            try {
                HtmlPage htmlPage = client.getPage(url);

                // HtmlUnit DOM model:
                // look up the <form> whose name attribute is "f"
                HtmlForm form = htmlPage.getFormByName("f");
                // look up the <input> whose name attribute is "q"
                HtmlTextInput text = form.getInputByName("q");
                // type the search term
                text.setText("baidu");
                // look up the submit button
                HtmlSubmitInput button = form.getInputByName("btnG");
                // submit the form; click() returns the resulting page
                HtmlPage listPage = button.click();

                System.out.println(listPage.asXml());
            } finally {
                // Close the local client even when a lookup or request above throws.
                client.closeAllWindows();
            }
        }

        /**
         * Fetches the Baidu home page once with GET and once with POST and prints both bodies.
         *
         * @param args unused
         * @throws Exception if a request fails
         */
        public static void main(String[] args) throws Exception {
            String url = "http://www.baidu.com";

            HtmlUnitDemo htmlUnit = new HtmlUnitDemo();
            try {
                byte[] getResponse = htmlUnit.sendGetRequest(url);
                System.out.println("Get Body : " + bodyAsString(getResponse));
                byte[] postResponse = htmlUnit.sendPostRequest(url, null);
                System.out.println("Post Body : " + bodyAsString(postResponse));
            } finally {
                // Shut the client down even when one of the requests throws.
                htmlUnit.shutdown();
            }
        }

        /** Decodes a response body as UTF-8; a {@code null} body (non-200 response) becomes "". */
        private static String bodyAsString(byte[] body) {
            return body == null ? "" : new String(body, StandardCharsets.UTF_8);
        }
    }
    
    

    相关文章

      网友评论

          本文标题:HtmlUnit2.14使用样例—获取动态网页

          本文链接:https://www.haomeiwen.com/subject/tflmlftx.html