美文网首页
Unicode字符, utf8编码, 展示小工具

Unicode字符, utf8编码, 展示小工具

作者: 那就太谢谢你了 | 来源:发表于2023-02-28 19:04 被阅读0次

    Unicode

    www.unicode.org/Public/UNIDATA/Blocks.txt

    UTF-8 编码规则

    image.png

    Unicode 和 UTF-8 有什么区别? - 知乎 (zhihu.com)

    代码

    
    package com.vege;
    
    import org.apache.tomcat.util.buf.HexUtils;
    
    import java.io.IOException;
    import java.nio.charset.StandardCharsets;
    import java.util.HashMap;
    import java.util.Map;
    
    /**
     * Small demo tool: hand-encodes Unicode code points as UTF-8 bytes and prints
     * the resulting characters, 16 per row, for a range of code points.
     */
    public class UnicodeToUtf8 {

        /** Number of characters printed per output row. */
        private static final int CHARS_PER_ROW = 16;

        public static void main(String[] args) throws IOException {

            // Print the character whose code point is 0x4DC0.
            String temp = unicodeToUtf8Str(hexStrToInt("4DC0"));
            System.out.println("0x4DC0 - " + temp);
            System.out.println();

            // ======================================================

            // Print every code point in [from, to], both ends included.
            int from = 0x4DC0;
            int to = 0x4DFF;
            printRange(from, to);

            System.out.println();
        }

        /**
         * Prints the characters for the code points {@code from..to} (inclusive),
         * 16 per row, each row preceded by a "-> start - end : " label line.
         *
         * @param from first code point to print
         * @param to   last code point to print (inclusive)
         */
        private static void printRange(int from, int to) {
            for (int i = from; i <= to; i++) {
                if ((i - from) % CHARS_PER_ROW == 0) {
                    System.out.println();
                    String start = hexLabel(i);
                    // The end label is the start label with its last hex digit forced to 'f'.
                    String end = start.substring(0, start.length() - 1) + "f";
                    System.out.println("-> " + start + " - " + end + " : ");
                }

                System.out.print("\t");
                System.out.print(unicodeToUtf8Str(i));
            }
        }

        /**
         * Formats a code point as "0x" followed by six lowercase hex digits,
         * e.g. 0x4DC0 -> "0x004dc0".
         *
         * @param codePoint value to format
         * @return the label text
         */
        private static String hexLabel(int codePoint) {
            byte[] bytes = intToByteArrayHighFirst(codePoint);
            StringBuilder label = new StringBuilder("0x");
            // Skip the most significant byte: only the low three bytes are shown.
            for (int k = 1; k < bytes.length; k++) {
                label.append(String.format("%02x", bytes[k] & 0xff));
            }
            return label.toString();
        }

        /**
         * Parses a hexadecimal string into an int.
         * eg. "4DC0" -> 19904
         * <p>
         * Both uppercase and lowercase digits are accepted. An empty string yields 0.
         *
         * @param str hexadecimal digits, without any "0x" prefix
         * @return the parsed value
         * @throws IllegalArgumentException if a character is not a hex digit or the
         *                                  value does not fit in 32 bits
         */
        private static int hexStrToInt(String str) {
            int result = 0;
            for (char c : str.toCharArray()) {
                int digit = hexDigitValue(c);
                if (digit < 0) {
                    throw new IllegalArgumentException("invalid char: '" + c + "' in \"" + str + "\"");
                }
                // If the top nibble is already occupied, shifting would silently drop bits.
                if ((result >>> 28) != 0) {
                    throw new IllegalArgumentException("hex value too large for int: \"" + str + "\"");
                }
                result = (result << 4) | digit;
            }
            return result;
        }

        /**
         * Returns the numeric value of an ASCII hex digit, or -1 if {@code c} is not one.
         */
        private static int hexDigitValue(char c) {
            if (c >= '0' && c <= '9') {
                return c - '0';
            }
            if (c >= 'A' && c <= 'F') {
                return c - 'A' + 10;
            }
            if (c >= 'a' && c <= 'f') {
                return c - 'a' + 10;
            }
            return -1;
        }

        /**
         * Encodes a single Unicode code point as UTF-8 bytes and decodes those bytes
         * back into a String.
         * <p>
         * Layout by range:
         * <pre>
         * 0x0000..0x7F        0xxxxxxx
         * 0x0080..0x7FF       110xxxxx 10xxxxxx
         * 0x0800..0xFFFF      1110xxxx 10xxxxxx 10xxxxxx
         * 0x10000..0x10FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         * </pre>
         * Surrogate code points (0xD800..0xDFFF) are not rejected here; their 3-byte
         * form is not well-formed UTF-8, so the returned text is whatever the JDK
         * decoder substitutes for malformed input.
         *
         * @param unicode a single code point in the range 0..0x10FFFF
         * @return the String decoded from the UTF-8 bytes of that code point
         * @throws IllegalArgumentException if {@code unicode} is outside 0..0x10FFFF
         */
        private static String unicodeToUtf8Str(int unicode) {
            if (unicode < 0 || unicode > 0x10FFFF) {
                throw new IllegalArgumentException("error unicode: " + unicode);
            }

            byte[] bytes;
            if (unicode <= 0x7F) {
                bytes = new byte[] {(byte) unicode};
            } else if (unicode <= 0x7FF) {
                bytes = new byte[] {
                    (byte) (0b11000000 | (unicode >> 6)),
                    continuationByte(unicode)
                };
            } else if (unicode <= 0xFFFF) {
                bytes = new byte[] {
                    (byte) (0b11100000 | (unicode >> 12)),
                    continuationByte(unicode >> 6),
                    continuationByte(unicode)
                };
            } else {
                // 0x10000..0x10FFFF, lower bound included.
                bytes = new byte[] {
                    (byte) (0b11110000 | (unicode >> 18)),
                    continuationByte(unicode >> 12),
                    continuationByte(unicode >> 6),
                    continuationByte(unicode)
                };
            }
            return new String(bytes, StandardCharsets.UTF_8);
        }

        /**
         * Builds a UTF-8 continuation byte (10xxxxxx) from the low six bits of {@code bits}.
         */
        private static byte continuationByte(int bits) {
            return (byte) (0b10000000 | (bits & 0b111111));
        }

        /**
         * Splits an int into four bytes, most significant byte first.
         * eg. 19904 -> new byte[] {0b00000000, 0b00000000, 0b01001101, (byte) 0b11000000}
         *
         * @param a value to split
         * @return four-element array; index 0 holds bits 31..24, index 3 holds bits 7..0
         */
        private static byte[] intToByteArrayHighFirst(int a) {
            byte[] b = new byte[4];
            b[3] = (byte) (a & 0xff);
            b[2] = (byte) (a >> 8 & 0xff);
            b[1] = (byte) (a >> 16 & 0xff);
            b[0] = (byte) (a >> 24 & 0xff);
            return b;
        }

    }
    

    结果

    image.png

    相关文章

      网友评论

          本文标题:Unicode字符, utf8编码, 展示小工具

          本文链接:https://www.haomeiwen.com/subject/hqilldtx.html