Android 编码URL中文字符串

作者: simpleeeeee | 来源:发表于2016-01-04 15:45 被阅读6590次

    如果 url 包含中文,浏览器是会自动编码解析的,但是在客户端开发会遇到中文编码问题,所以必须做中文编码。提供三种中文编码方式,仅供开发参考。

    方案1:对 JDK 的 java.net.URLEncoder 源码进行修改(额外保留 : / ? ; & = 等 URL 分隔符不编码),安全可靠

    package com.sunpeng.util;
    
    import java.io.CharArrayWriter;
    import java.io.UnsupportedEncodingException;
    import java.net.URLDecoder;
    import java.nio.charset.Charset;
    import java.nio.charset.IllegalCharsetNameException;
    import java.nio.charset.UnsupportedCharsetException;
    import java.util.BitSet;
    
    /**
     * Percent-encodes the unsafe characters of a URL string while leaving the
     * URL's own delimiters in place, so a complete URL that contains, for
     * example, Chinese text can be encoded in a single call.
     * <p>
     * The implementation appears to be adapted from
     * {@code java.net.URLEncoder}; the difference visible here is the larger
     * set of characters that is passed through unchanged (see
     * {@link #dontNeedEncoding}).
     */
    public class URLEncoderURI {

        /**
         * Characters that are copied to the output without percent-encoding:
         * ASCII letters and digits, the punctuation {@code - _ . *}, the URL
         * delimiters {@code : / ? ; & =}, and the space character. A space is
         * a member only so that it skips percent-encoding;
         * {@link #encode(String, String)} rewrites it to {@code '+'}.
         * <p>
         * Note that {@code '%'} and {@code '#'} are not members, so input that
         * is already percent-encoded is encoded a second time and a fragment
         * separator is escaped.
         */
        static final BitSet dontNeedEncoding;

        /** Distance between a lower-case ASCII letter and its upper-case form. */
        static final int caseDiff = ('a' - 'A');

        static {
            BitSet safe = new BitSet(256);
            safe.set('a', 'z' + 1);
            safe.set('A', 'Z' + 1);
            safe.set('0', '9' + 1);
            // Space, the classic form-encoding marks, then the URL delimiters.
            for (char c : " -_.*:/?;&=".toCharArray()) {
                safe.set(c);
            }
            dontNeedEncoding = safe;
        }

        /**
         * You can't call the constructor.
         */
        private URLEncoderURI() {
        }

        /**
         * Encodes every character of {@code s} that is not in
         * {@link #dontNeedEncoding} as {@code %XX} escapes (upper-case hex) of
         * its bytes in the given charset, and turns each space into
         * {@code '+'}.
         *
         * @param s
         *            the string (typically a whole URL) to encode; must not be
         *            {@code null}
         * @param enc
         *            the name of the charset used to obtain the bytes of unsafe
         *            characters, e.g. {@code "UTF-8"}; must not be {@code null}
         * @return the encoded string, or {@code s} itself when nothing had to
         *         be changed
         * @throws NullPointerException
         *             if {@code s} or {@code enc} is {@code null}
         * @throws UnsupportedEncodingException
         *             if {@code enc} is not a legal or supported charset name;
         *             the underlying charset exception is attached as the cause
         * @see java.net.URLDecoder#decode(java.lang.String, java.lang.String)
         */
        public static String encode(String s, String enc) throws UnsupportedEncodingException {
            if (s == null) {
                throw new NullPointerException("s");
            }
            if (enc == null) {
                throw new NullPointerException("charsetName");
            }
            final Charset charset = lookupCharset(enc);

            final int length = s.length();
            final StringBuilder out = new StringBuilder(length);
            boolean changed = false;

            int i = 0;
            while (i < length) {
                char c = s.charAt(i);
                if (dontNeedEncoding.get(c)) {
                    if (c == ' ') {
                        out.append('+');
                        changed = true;
                    } else {
                        out.append(c);
                    }
                    i++;
                    continue;
                }

                // Take the maximal run of unsafe characters and convert it in
                // one go. Surrogates are never in the safe set, so a surrogate
                // pair always stays together inside a single run.
                int runEnd = i + 1;
                while (runEnd < length && !dontNeedEncoding.get(s.charAt(runEnd))) {
                    runEnd++;
                }
                for (byte b : s.substring(i, runEnd).getBytes(charset)) {
                    out.append('%');
                    out.append(upperHexDigit((b >> 4) & 0xF));
                    out.append(upperHexDigit(b & 0xF));
                }
                changed = true;
                i = runEnd;
            }

            // Hand back the original instance when no character was rewritten.
            return changed ? out.toString() : s;
        }

        /**
         * Resolves a charset name, translating the unchecked charset
         * exceptions into {@link UnsupportedEncodingException} with the
         * original exception kept as its cause.
         */
        private static Charset lookupCharset(String enc) throws UnsupportedEncodingException {
            try {
                return Charset.forName(enc);
            } catch (IllegalCharsetNameException e) {
                throw unsupported(enc, e);
            } catch (UnsupportedCharsetException e) {
                throw unsupported(enc, e);
            }
        }

        /** Builds an {@link UnsupportedEncodingException} for {@code enc} carrying {@code cause}. */
        private static UnsupportedEncodingException unsupported(String enc, RuntimeException cause) {
            UnsupportedEncodingException ex = new UnsupportedEncodingException(enc);
            ex.initCause(cause);
            return ex;
        }

        /** Returns the upper-case hexadecimal digit for a value in the range 0-15. */
        private static char upperHexDigit(int nibble) {
            char digit = Character.forDigit(nibble, 16);
            return Character.isLetter(digit) ? (char) (digit - caseDiff) : digit;
        }
    }
    

    用法:URLEncoderURI.encode(url, "UTF-8");

    方案2:使用 android.net.Uri.encode,并通过第二个参数指定不需要编码的字符(该参数是允许保留的字符集合,并非正则表达式)

        /**
         * Encodes a URL by delegating to {@code Uri.encode} with a fixed list
         * of characters that are passed as its "allow" argument.
         *
         * @param url the URL string to encode
         * @return whatever {@code Uri.encode} returns for {@code url}
         */
        public static String encodeUrl(String url) {
            // NOTE(review): the second argument of Uri.encode is presumably a
            // literal list of characters to leave unencoded, not a regular
            // expression, so '[' and ']' here are themselves allowed
            // characters - confirm against the Android documentation.
            final String allowedChars = "-![.:/,%?&=]";
            return Uri.encode(url, allowedChars);
        }
    

    方案3:直接遍历url

    参考:java中url汉字编码互相转换实例

        /**
         * Percent-encodes every character above U+00FF in {@code s} as the
         * {@code %XX} escapes (upper-case hex) of its UTF-8 bytes and copies
         * all other characters through unchanged.
         * <p>
         * A valid surrogate pair is encoded as one code point (four UTF-8
         * bytes) rather than as two separate, unencodable halves.
         *
         * @param s the string to encode; must not be {@code null}
         * @return the encoded string
         * @throws NullPointerException if {@code s} is {@code null}
         */
        public static String toUtf8String(String s) {
            // Charset lookup cannot fail for UTF-8, so no checked exception
            // has to be swallowed the way getBytes(String) would require.
            final java.nio.charset.Charset utf8 = java.nio.charset.Charset.forName("UTF-8");
            final String hexDigits = "0123456789ABCDEF";
            final int length = s.length();
            final StringBuilder sb = new StringBuilder(length);

            int i = 0;
            while (i < length) {
                char c = s.charAt(i);
                // NOTE(review): characters U+0080..U+00FF (and spaces) are
                // passed through unencoded, as before - confirm this is
                // intended for the URLs this is used on.
                if (c <= 255) {
                    sb.append(c);
                    i++;
                    continue;
                }

                // Keep both halves of a surrogate pair together so the
                // supplementary character is converted to UTF-8 as a whole.
                int end = i + 1;
                if (Character.isHighSurrogate(c) && end < length
                        && Character.isLowSurrogate(s.charAt(end))) {
                    end++;
                }

                for (byte b : s.substring(i, end).getBytes(utf8)) {
                    int value = b & 0xFF;
                    sb.append('%');
                    sb.append(hexDigits.charAt(value >> 4));
                    sb.append(hexDigits.charAt(value & 0xF));
                }
                i = end;
            }
            return sb.toString();
        }
    

    相关文章

      网友评论

        本文标题:Android 编码URL中文字符串

        本文链接:https://www.haomeiwen.com/subject/hyhahttx.html