美文网首页源码Java学习笔记程序员
HttpClient之自定义HttpRequestRetryHa

HttpClient之自定义HttpRequestRetryHa

作者: holly_wang_王小飞 | 来源:发表于2016-11-15 21:06 被阅读5037次

    构造httpclient的时候可以调用setRetryHandler(HttpRequestRetryHandler)。**HttpRequestRetryHandler**是Http请求出错后进行重试处理的接口类,在某些要求比较严格的业务场景下,这个参数还是比较重要的。

    **HttpRequestRetryHandler**的已知实现类有DefaultHttpRequestRetryHandler,以及继承了DefaultHttpRequestRetryHandler的StandardHttpRequestRetryHandler。

    DefaultHttpRequestRetryHandler

    /*
     * ====================================================================
     * Licensed to the Apache Software Foundation (ASF) under one
     * or more contributor license agreements.  See the NOTICE file
     * distributed with this work for additional information
     * regarding copyright ownership.  The ASF licenses this file
     * to you under the Apache License, Version 2.0 (the
     * "License"); you may not use this file except in compliance
     * with the License.  You may obtain a copy of the License at
     *
     *   http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing,
     * software distributed under the License is distributed on an
     * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
     * KIND, either express or implied.  See the License for the
     * specific language governing permissions and limitations
     * under the License.
     * ====================================================================
     *
     * This software consists of voluntary contributions made by many
     * individuals on behalf of the Apache Software Foundation.  For more
     * information on the Apache Software Foundation, please see
     * <http://www.apache.org/>.
     *
     */
    
    package org.apache.http.impl.client;
    
    import java.io.IOException;
    import java.io.InterruptedIOException;
    import java.net.ConnectException;
    import java.net.UnknownHostException;
    import java.util.Arrays;
    import java.util.Collection;
    import java.util.HashSet;
    import java.util.Set;
    
    import javax.net.ssl.SSLException;
    
    import org.apache.http.HttpEntityEnclosingRequest;
    import org.apache.http.HttpRequest;
    import org.apache.http.annotation.Immutable;
    import org.apache.http.client.HttpRequestRetryHandler;
    import org.apache.http.client.methods.HttpUriRequest;
    import org.apache.http.client.protocol.HttpClientContext;
    import org.apache.http.protocol.HttpContext;
    import org.apache.http.util.Args;
    
    /**
     * Default {@link HttpRequestRetryHandler} implementation used by request executors.
     * <p>
     * A failed request is retried only while the execution count has not exceeded the
     * configured retry count, the failure is not an instance of one of the configured
     * non-retriable {@link IOException} types, and the request has not been aborted.
     * Past those checks, requests treated as idempotent are retried; any other request
     * is retried only if it was not fully sent or if retrying sent requests is enabled.
     *
     * @since 4.0
     */
    @Immutable
    public class DefaultHttpRequestRetryHandler implements HttpRequestRetryHandler {

        public static final DefaultHttpRequestRetryHandler INSTANCE = new DefaultHttpRequestRetryHandler();

        /** Upper bound on the execution count for which a retry is still allowed. */
        private final int retryCount;

        /** Whether requests that were already sent successfully may be retried. */
        private final boolean requestSentRetryEnabled;

        /** Exception types (and, via {@code isInstance}, their subtypes) that never trigger a retry. */
        private final Set<Class<? extends IOException>> nonRetriableClasses;

        /**
         * Creates a retry handler with an explicit set of non-retriable exception types.
         *
         * @param retryCount how many times to retry; 0 means no retries
         * @param requestSentRetryEnabled true if it's OK to retry requests that have been sent
         * @param clazzes the IOException types that should not be retried
         * @since 4.3
         */
        protected DefaultHttpRequestRetryHandler(
                final int retryCount,
                final boolean requestSentRetryEnabled,
                final Collection<Class<? extends IOException>> clazzes) {
            super();
            this.retryCount = retryCount;
            this.requestSentRetryEnabled = requestSentRetryEnabled;
            // Copy the caller's collection into a set owned by this handler.
            this.nonRetriableClasses = new HashSet<Class<? extends IOException>>(clazzes);
        }

        /**
         * Creates a retry handler that never retries the following IOException types:
         * <ul>
         * <li>InterruptedIOException</li>
         * <li>UnknownHostException</li>
         * <li>ConnectException</li>
         * <li>SSLException</li>
         * </ul>
         *
         * @param retryCount how many times to retry; 0 means no retries
         * @param requestSentRetryEnabled true if it's OK to retry non-idempotent requests that have been sent
         */
        @SuppressWarnings("unchecked")
        public DefaultHttpRequestRetryHandler(final int retryCount, final boolean requestSentRetryEnabled) {
            this(retryCount, requestSentRetryEnabled, Arrays.asList(
                    InterruptedIOException.class,
                    UnknownHostException.class,
                    ConnectException.class,
                    SSLException.class));
        }

        /**
         * Creates a retry handler with a retry count of 3, {@code requestSentRetryEnabled}
         * set to {@code false}, and the same non-retriable IOException types as
         * {@link #DefaultHttpRequestRetryHandler(int, boolean)}.
         */
        public DefaultHttpRequestRetryHandler() {
            this(3, false);
        }

        /**
         * Decides whether the failed request should be executed again.
         *
         * @param exception the I/O failure that occurred; must not be {@code null}
         * @param executionCount how many times the request has been executed so far
         * @param context the HTTP execution context; must not be {@code null}
         * @return {@code true} if the request should be retried, {@code false} otherwise
         */
        @Override
        public boolean retryRequest(
                final IOException exception,
                final int executionCount,
                final HttpContext context) {
            Args.notNull(exception, "Exception parameter");
            Args.notNull(context, "HTTP context");

            // Give up once the retry budget is exhausted.
            if (executionCount > this.retryCount) {
                return false;
            }

            // Exact type match first, then subtypes of any non-retriable type.
            if (this.nonRetriableClasses.contains(exception.getClass())) {
                return false;
            }
            for (final Class<? extends IOException> nonRetriable : this.nonRetriableClasses) {
                if (nonRetriable.isInstance(exception)) {
                    return false;
                }
            }

            final HttpClientContext clientContext = HttpClientContext.adapt(context);
            final HttpRequest request = clientContext.getRequest();

            if (requestIsAborted(request)) {
                return false;
            }

            // Idempotent requests are retried; anything else only if it has not been
            // sent fully or if retrying already-sent requests is allowed.
            return handleAsIdempotent(request)
                    || !clientContext.isRequestSent()
                    || this.requestSentRetryEnabled;
        }

        /**
         * @return {@code true} if this handler will retry methods that have
         * successfully sent their request, {@code false} otherwise
         */
        public boolean isRequestSentRetryEnabled() {
            return this.requestSentRetryEnabled;
        }

        /**
         * @return the maximum number of times a method will be retried
         */
        public int getRetryCount() {
            return this.retryCount;
        }

        /**
         * Tells whether the request may be treated as idempotent. This implementation
         * treats every request that is not an {@link HttpEntityEnclosingRequest} as idempotent.
         *
         * @param request the request that failed
         * @return {@code true} if the request is considered idempotent
         * @since 4.2
         */
        protected boolean handleAsIdempotent(final HttpRequest request) {
            if (request instanceof HttpEntityEnclosingRequest) {
                return false;
            }
            return true;
        }

        /**
         * Tells whether the request (or, for a {@code RequestWrapper}, the original
         * request it wraps) is an {@link HttpUriRequest} that has been aborted.
         *
         * @param request the request that failed
         * @return {@code true} if the underlying request reports itself as aborted
         * @since 4.2
         *
         * @deprecated (4.3)
         */
        @Deprecated
        protected boolean requestIsAborted(final HttpRequest request) {
            // The wrapper does not forward the abort state, so look at the original request.
            final HttpRequest target = (request instanceof RequestWrapper)
                    ? ((RequestWrapper) request).getOriginal()
                    : request;
            if (target instanceof HttpUriRequest) {
                return ((HttpUriRequest) target).isAborted();
            }
            return false;
        }

    }
    

    默认构造函数是

        /**
         * Creates a handler by delegating to the two-argument constructor with a
         * retry count of 3 and {@code requestSentRetryEnabled} set to {@code false}.
         */
        public DefaultHttpRequestRetryHandler() {
            this(3, false);
        }
    

    参数requestSentRetryEnabled表示请求已经成功发送之后是否仍然允许重试,这里设置为false;我觉得一般情况下都不要设置为true。
    主要实现的方法是

     boolean retryRequest(IOException exception, int executionCount, HttpContext context);
    

    StandardHttpRequestRetryHandler并没有重写该方法

    /**
     * {@link DefaultHttpRequestRetryHandler} variant that decides idempotency by the
     * HTTP method name of the request: GET, HEAD, PUT, DELETE, OPTIONS and TRACE are
     * treated as idempotent, every other method is not.
     */
    @Immutable
    public class StandardHttpRequestRetryHandler extends DefaultHttpRequestRetryHandler {

        /** Upper-case HTTP method names mapped to whether the method is idempotent. */
        private final Map<String, Boolean> idempotentMethods;

        /**
         * Creates a handler with the given retry settings and the fixed set of
         * idempotent method names.
         *
         * @param retryCount passed through to the superclass constructor
         * @param requestSentRetryEnabled passed through to the superclass constructor
         */
        public StandardHttpRequestRetryHandler(final int retryCount, final boolean requestSentRetryEnabled) {
            super(retryCount, requestSentRetryEnabled);
            final Map<String, Boolean> methods = new ConcurrentHashMap<String, Boolean>();
            for (final String name : new String[] {"GET", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"}) {
                methods.put(name, Boolean.TRUE);
            }
            this.idempotentMethods = methods;
        }

        /**
         * Creates a handler with a retry count of 3 and
         * {@code requestSentRetryEnabled} set to {@code false}.
         */
        public StandardHttpRequestRetryHandler() {
            this(3, false);
        }

        /**
         * Looks up the upper-cased request method in the idempotent-method table.
         *
         * @param request the request that failed
         * @return {@code true} only if the method is registered as idempotent
         */
        @Override
        protected boolean handleAsIdempotent(final HttpRequest request) {
            final String methodName = request.getRequestLine().getMethod().toUpperCase(Locale.ROOT);
            return Boolean.TRUE.equals(this.idempotentMethods.get(methodName));
        }

    }
    

    只是重写了

    protected boolean handleAsIdempotent(final HttpRequest request)
    

    我们参考后完全可以实现自己的HttpRequestRetryHandler

    初始化httpClient
    在httpClient4.5中,初始化的方式已经和以前的版本有差异

    static  CloseableHttpClient client = HttpClients.createDefault();  
    和
    static CloseableHttpClient httpClient=HttpClients.custom().build();  
    在该方式中可以添加一些网络请求的设置
    

    可以直接使用匿名类

    HttpRequestRetryHandler handler = new HttpRequestRetryHandler() {

        /**
         * Decides whether a request that failed with an I/O error is executed again.
         * <p>
         * Rules, in order: never retry once the handler has been asked more than five
         * times; never retry an {@link SSLException}; retry unknown-host,
         * connect-timeout and no-response failures; otherwise retry only requests
         * that carry no entity body, which are treated as idempotent.
         *
         * @param arg0 the I/O failure that occurred
         * @param retryTimes how many times the request has been executed so far
         * @param arg2 the HTTP execution context of the failed request
         * @return {@code true} to retry the request, {@code false} to give up
         */
        @Override
        public boolean retryRequest(IOException arg0, int retryTimes, HttpContext arg2) {
            // Enforce the cap before anything else; otherwise a persistent failure
            // (for example a host that never resolves) would be retried forever.
            if (retryTimes > 5) {
                return false;
            }
            // The original condition used "!(arg0 instanceof SSLException)", which is
            // true for every non-SSL exception and made the other checks dead code.
            // SSL failures are rejected explicitly instead.
            if (arg0 instanceof SSLException) {
                return false;
            }
            // Failures this handler always retries.
            if (arg0 instanceof UnknownHostException
                    || arg0 instanceof ConnectTimeoutException
                    || arg0 instanceof NoHttpResponseException) {
                return true;
            }

            HttpClientContext clientContext = HttpClientContext.adapt(arg2);
            HttpRequest request = clientContext.getRequest();
            // A request without an entity body is considered idempotent, i.e. repeating
            // it has no additional effect, so it is safe to retry.
            return !(request instanceof HttpEntityEnclosingRequest);
        }
    };
    
    

    还可以设置路由策略 即设置代理方式访问

    // Address of the proxy that requests are routed through.
    HttpHost proxy = new HttpHost("127.0.0.1", 80);
    DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner(proxy);
    // Build the client with the proxy route planner, the retry handler, a one-day
    // connection time-to-live and the cookie store.
    CloseableHttpClient httpClient = HttpClients.custom()
            .setRoutePlanner(routePlanner)
            .setRetryHandler(handler)
            .setConnectionTimeToLive(1, TimeUnit.DAYS)
            .setDefaultCookieStore(cookieStore)
            .build();
    

    附录:
    Httpclient4.5后对于get请求方式的改变

    // Shared request settings: connect timeout, socket timeout and cookie policy.
    static RequestConfig config = RequestConfig.custom()
            .setConnectTimeout(6000)
            .setSocketTimeout(6000)
            .setCookieSpec(CookieSpecs.STANDARD)
            .build();
        /**
         * Fetches {@code url} with an HTTP GET and returns the response body as text.
         * <p>
         * Every response header and the body are also printed to standard output.
         * An {@link IOException} is printed and swallowed, in which case the method
         * returns {@code null}.
         *
         * @param url address to request
         * @return the decoded response body, or {@code null} if the request failed
         */
        public static String getDemo(String url) {
            HttpGet get = new HttpGet(url);
            get.setConfig(config);
            HttpResponse response = null;
            String html = null;
            try {
                response = client.execute(get);
                // Print the response headers.
                for (Header header : response.getAllHeaders()) {
                    System.out.println(header);
                }
                // Decode the body exactly once. The charset declared by the response is
                // used when present; "UTF-8" is only the fallback. For a site that serves
                // another encoding without declaring it, pass that encoding here instead
                // (for example "GB2312"). Re-encoding an already decoded string, as the
                // previous code did, corrupts every non-ASCII character.
                html = EntityUtils.toString(response.getEntity(), "UTF-8");
                System.out.println(html);
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // Make sure the entity is consumed even when reading failed part-way,
                // so the underlying connection can be released.
                if (response != null) {
                    EntityUtils.consumeQuietly(response.getEntity());
                }
            }
            return html;
        }
    

    大致流程:新建httpget对象->用httpClient执行->解析返回的response得到自己需要的内容
    cookieSpec:即cookie策略。参数为cookiespecs的一些字段。作用:1、如果网站header中有set-cookie字段时,采用默认方式可能会被cookie reject,无法写入cookie。将此属性设置成CookieSpecs.STANDARD_STRICT可避免此情况。2、如果要想忽略cookie访问,则将此属性设置成CookieSpecs.IGNORE_COOKIES。
    tips:注意网站编码,否则容易出现乱码
    执行post请求:

    /**
     * Posts a URL-encoded form to {@code url} and prints the response body to
     * standard output. An {@link IOException} is printed and otherwise ignored.
     *
     * @param url address the form is posted to
     */
    public static void postDemo(String url) {
        HttpPost post = new HttpPost(url);
        post.setConfig(config);
        post.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36");
        post.setHeader("Connection", "keep-alive");

        // Five identical placeholder form fields.
        List<NameValuePair> formFields = new ArrayList<NameValuePair>();
        for (int i = 0; i < 5; i++) {
            formFields.add(new BasicNameValuePair("key", "value"));
        }

        try {
            post.setEntity(new UrlEncodedFormEntity(formFields, "utf-8"));
            HttpResponse response = client.execute(post);
            System.out.println(EntityUtils.toString(response.getEntity()));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    

    大致流程:新建post对象->新建需要的表单页->将表单内容设置入请求中->执行并获得response
    解析response

    // Read the response body as a string.
    String responseHtml = EntityUtils.toString(response.getEntity());  
    int statusCode = response.getStatusLine().getStatusCode();// HTTP status code
    // Read response headers.
    response.getFirstHeader("key");// first header named "key"
                response.getHeaders("key");// all headers named "key", returned as an array
                response.getLastHeader("key");  
    // Obtain the body as an InputStream. Some resources (for example captcha images) may
    // require cookies, in which case they have to be downloaded through httpClient.
    // NOTE(review): the entity was already read by EntityUtils.toString above; presumably
    // this line is meant as an alternative to that call, not a follow-up - confirm.
    InputStream inputStream = response.getEntity().getContent();
    

    管理cookie

    // Register an explicit cookie store as the client's default so the cookies
    // can be read and modified through it.
    CookieStore cookieStore = new BasicCookieStore();
    CloseableHttpClient httpClient = HttpClients.custom().setDefaultCookieStore(cookieStore).build();
    

    httpClient里默认自动管理cookie,如果想要提取cookie或者发送自定义的cookie,则需要在httpClient对象初始化时设置一个默认的cookiestore来保存。(方法见初始化httpClient对象里的setDefaultCookieStore)。
    得到当前所有cookie:

    // Print every cookie currently held by the store, framed by separator lines.
    List<Cookie> cookies = cookieStore.getCookies();
    String separator = "-----------------------";
    System.out.println("cookie is:");
    System.out.println(separator);
    for (Cookie current : cookies) {
        System.out.println(current);
    }
    System.out.println(separator);
    

    清除所有cookie:

    cookieStore.clear();
    

    发送自定义cookie:(new了一个对象之后可以设置多种属性。)

    // Create a custom cookie; further attributes can be set on it before it is stored.
    BasicClientCookie cookie = new BasicClientCookie("name", "value");
    cookie.setDomain("domain");
    // The expiry date must lie in the future: a cookie whose expiry is "now" is
    // already expired when it reaches the store, and BasicCookieStore.addCookie
    // does not add expired cookies. Here the cookie is valid for one day.
    cookie.setExpiryDate(new Date(System.currentTimeMillis() + 24L * 60 * 60 * 1000));
    cookieStore.addCookie(cookie);
    

    管理header:
    在平常抓取过程中,经常需要在请求中加入许多header伪装成一个正常的浏览器。以免被服务器认出是爬虫而被封。
    设置一些常见header:

    post.setHeader("User-Agent",  
                    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36");  
            post.setHeader("Connection", "keep-alive");  
    

    注意:下载某些网站的资源时,服务器会获取你的来源站,并发出对应的响应。如果来源站不对,可能会被服务器拒绝。此时只需要在请求中加个Referer header就行。

    get1.setHeader("Referer", "http://www.a.com");  
    

    ps:
    1、爬虫也要遵守基本法:为了不给对方服务器造成负担(也避免被封),尽量在多次请求之间sleep一个随机时长。
    2、爬取非英文网站时注意编码格式,国内一般为utf-8,也有一些是gb2312,获取时注意转码。
    3、多获得一些可靠IP(备胎),一旦自身ip被封,赶快去找备胎。附带一个简单的判断网站是否需要代理方法:

    /**
     * Checks whether the target site has to be accessed through a proxy.
     * <p>
     * A direct connection is attempted with a 6000 ms connect and read timeout.
     * A 2xx or 3xx status means the site answered directly. Any other status, or
     * any {@link IOException} (which is printed), is treated as "proxy needed".
     *
     * @return {@code false} if the site answered directly, {@code true} otherwise
     */
    private boolean isNeedProxy() {
        boolean result = true;
        HttpURLConnection connection = null;
        try {
            URL url = new URL("http://apkpure.com/");
            connection = (HttpURLConnection) url.openConnection();
            connection.setConnectTimeout(6000);
            // Bound the wait for the response too; without a read timeout a server
            // that accepts the connection but never answers blocks this call.
            connection.setReadTimeout(6000);
            // Judge by status code instead of Content-Length: the length is -1 when
            // it is unknown (for example chunked responses), which would wrongly
            // report a reachable site as needing a proxy.
            int status = connection.getResponseCode();
            if (status >= 200 && status < 400) {
                result = false;
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the connection's resources on every path.
            if (connection != null) {
                connection.disconnect();
            }
        }
        return result;
    }
    

    相关文章

      网友评论

      • TimBing:@Override
        public boolean retryRequest(IOException arg0, int retryTimes, HttpContext arg2) {
        if (arg0 instanceof UnknownHostException || arg0 instanceof ConnectTimeoutException
        || !(arg0 instanceof SSLException) || arg0 instanceof NoHttpResponseException) {
        return true;
        }
        if (retryTimes > 5) {
        return false;
        }
        HttpClientContext clientContext = HttpClientContext.adapt(arg2);
        HttpRequest request = clientContext.getRequest();
        boolean idempotent = !(request instanceof HttpEntityEnclosingRequest);
        if (idempotent) {
        // 如果请求被认为是幂等的,那么就重试。即重复执行不影响程序其他效果的
        return true;
        }
        return false;
        }

        这会不会死循环啊,要是一直抛UnknownHostException异常,就死循环了,一直重试下去。
        holly_wang_王小飞:@TimBing 确实:wink:
        TimBing:@holly_wang_王小飞 我觉得应该把 retryTime>5判断放到前面去
        holly_wang_王小飞:@TimBing 有retryTimes>5判断

      本文标题:HttpClient之自定义HttpRequestRetryHa

      本文链接:https://www.haomeiwen.com/subject/gfqlpttx.html