Java Web Crawler: Fetching Page HTML with HttpClient

Author: HAO延WEI | Published 2019-12-31 09:30

    Official documentation: http://hc.apache.org/httpclient-3.x/ (note that this page covers the legacy HttpClient 3.x API; the example below uses HttpClient 4.5.x)

    A getting-started example with HttpClient

    1. Create a plain Maven project.
    2. Edit the pom file and add the HttpClient dependency:
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.8</version>
    </dependency>
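
    For reference, this dependency goes inside the <dependencies> element of pom.xml. A minimal sketch of the full file follows; the project's own groupId, artifactId, and version are placeholders, not from the original article:

    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
        <!-- Placeholder coordinates for the demo project -->
        <groupId>com.data.spider</groupId>
        <artifactId>spider-demo</artifactId>
        <version>1.0-SNAPSHOT</version>

        <dependencies>
            <dependency>
                <groupId>org.apache.httpcomponents</groupId>
                <artifactId>httpclient</artifactId>
                <version>4.5.8</version>
            </dependency>
        </dependencies>
    </project>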
    
    3. Create the Java class:
    package com.data.spider.spider;
    
    import java.io.IOException;
    import org.apache.http.HttpEntity;
    import org.apache.http.HttpStatus;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.client.utils.HttpClientUtils;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;
    
    
    /**
     * Demo: HttpClientSpider
     *
     * @author Mr.Hao
     * @date 2019-12-28
     */
    public class HttpClientSpider {
    
    
        public static void getResponse() throws IOException {
            // 1. Create an HttpClient instance, analogous to opening a browser
            CloseableHttpClient httpClient = HttpClients.createDefault();
    
            CloseableHttpResponse response = null;

            try {
                // 2. Create a GET request, analogous to typing a URL into the browser's address bar
                HttpGet httpget = new HttpGet("http://www.baidu.com");

                httpget.setHeader("Accept", "text/html, */*; q=0.01");
                httpget.setHeader("Accept-Encoding", "gzip, deflate, sdch");
                httpget.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
                httpget.setHeader("Connection", "keep-alive");
                // httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36");

                // 3. Execute the GET request, analogous to pressing Enter after typing the address
                response = httpClient.execute(httpget);
    
                // 4. If the response status is 200, process the body
                if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                    // 5. Read the response content as a string
                    HttpEntity httpEntity = response.getEntity();
                    String html = EntityUtils.toString(httpEntity, "utf-8");
                    System.out.println(html);
                } else {
                    // For a non-200 status such as 404 (page not found), handle it as appropriate; omitted here
                    System.out.println("Response status is not 200");
                    System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));
                }
            } catch (ClientProtocolException e) {
                e.printStackTrace();
            } finally {
                // 6. Close the response and the client
                HttpClientUtils.closeQuietly(response);
                HttpClientUtils.closeQuietly(httpClient);
            }
    
        }
    
        public static void main(String[] args) {
            try {
                HttpClientSpider.getResponse();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
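
    In a real crawler you will usually also want request timeouts so that one slow server cannot hang the whole run. Below is a minimal sketch using HttpClient's RequestConfig; the timeout values and the demo class name are illustrative and not part of the original example:

    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;

    public class HttpClientTimeoutDemo {

        public static void main(String[] args) throws Exception {
            // Illustrative timeout values (milliseconds); tune them for the target site
            RequestConfig config = RequestConfig.custom()
                    .setConnectTimeout(5000)   // max time to establish the TCP connection
                    .setSocketTimeout(10000)   // max time to wait for response data
                    .build();

            HttpGet httpget = new HttpGet("http://www.baidu.com");
            httpget.setConfig(config);

            // try-with-resources closes both the client and the response automatically
            try (CloseableHttpClient httpClient = HttpClients.createDefault();
                 CloseableHttpResponse response = httpClient.execute(httpget)) {
                System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));
            }
        }
    }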
    
