Unicode

www.unicode.org/Public/UNIDATA/Blocks.txt

UTF-8 编码规则

image.png

Unicode 和 UTF-8 有什么区别？ - 知乎 (zhihu.com)

代码


package com.vege;

import org.apache.tomcat.util.buf.HexUtils;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

/**
 * Small demonstration of how a Unicode code point is encoded by hand into its
 * UTF-8 byte sequence.
 *
 * <p>{@link #main(String[])} prints the character for U+4DC0 and then every
 * character of the range U+4DC0..U+4DFF (inclusive), sixteen per row.
 */
public class UnicodeToUtf8 {

    // Maps an upper-case hexadecimal digit character to its numeric value (0-15).
    private static final Map<Character, Integer> map = new HashMap<>();

    static {
        String digits = "0123456789ABCDEF";
        for (int value = 0; value < digits.length(); value++) {
            map.put(digits.charAt(value), value);
        }
    }

    public static void main(String[] args) throws IOException {

        // Print the character whose Unicode code point is 0x4DC0.
        String temp = unicodeToUtf8Str(hexStrToInt("4DC0"));
        System.out.println("0x4DC0 - " + temp);
        System.out.println();

        // ======================================================

        // Print every code point from `from` to `to` (both inclusive),
        // sixteen characters per row, each row preceded by a range label.
        int from = 0x4DC0;
        int to = 0x4DFF;
        for (int i = from, j = 0; i <= to; i++, j++) {
            if (j % 16 == 0) {
                System.out.println();
                // Eight hex digits for the int; drop the top byte's first two
                // digits so the label reads e.g. 0x004dc0.
                String start = "0x" + toHexString(intToByteArrayHighFirst(i)).substring(2);
                // The row ends at the same prefix with the last nibble set to f.
                String end = start.substring(0, start.length() - 1) + "f";
                System.out.println("-> " + start + " - " + end + " : ");
            }

            System.out.print("\t");
            System.out.print(unicodeToUtf8Str(i));
        }

        System.out.println();
    }

    /**
     * Parses a hexadecimal string into an int.
     * e.g. "4DC0" -> 19904
     *
     * <p>Both upper-case and lower-case digits are accepted. Leading zeros are
     * allowed; an empty string yields 0.
     *
     * @param str hexadecimal digits without any "0x" prefix
     * @return the parsed value
     * @throws IllegalArgumentException if a character is not a hex digit or the
     *                                  value does not fit in 32 bits
     */
    private static int hexStrToInt(String str) {
        int result = 0;
        for (char c : str.toCharArray()) {
            Integer digit = map.get(Character.toUpperCase(c));
            if (digit == null) {
                throw new IllegalArgumentException("invalid char");
            }
            // The top nibble is about to be shifted out; refuse to lose bits silently.
            if ((result >>> 28) != 0) {
                throw new IllegalArgumentException("hex value does not fit in an int: " + str);
            }
            result = (result << 4) | digit;
        }
        return result;
    }

    /**
     * Encodes a single Unicode code point as UTF-8 and returns it as a String.
     * (An int has 32 bits; code points up to 0x10FFFF occupy only the low 21.)
     *
     * <p>Exactly one code point goes in and the String for that one code point
     * comes out. Layout of the encoded bytes:
     * <pre>
     *   0x0000   - 0x007F   : 0xxxxxxx
     *   0x0080   - 0x07FF   : 110xxxxx 10xxxxxx
     *   0x0800   - 0xFFFF   : 1110xxxx 10xxxxxx 10xxxxxx
     *   0x10000  - 0x10FFFF : 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * </pre>
     *
     * <p>Surrogate code points (0xD800-0xDFFF) are not rejected here; the bytes
     * produced for them are not well-formed UTF-8, so the String decoder
     * substitutes replacement characters.
     *
     * @param unicode a single Unicode code point
     * @return the String obtained by decoding the hand-built UTF-8 bytes
     * @throws IllegalArgumentException if {@code unicode} is negative or above 0x10FFFF
     */
    private static String unicodeToUtf8Str(int unicode) {
        final int continuation = 0b10000000;
        final int lowSix = 0b111111;

        byte[] bytes;
        if (unicode >= 0 && unicode <= 0x7F) {
            bytes = new byte[] {(byte) unicode};
        } else if (unicode >= 0x80 && unicode <= 0x7FF) {
            bytes = new byte[] {
                (byte) (0b11000000 | (unicode >> 6)),
                (byte) (continuation | (unicode & lowSix))
            };
        } else if (unicode >= 0x800 && unicode <= 0xFFFF) {
            bytes = new byte[] {
                (byte) (0b11100000 | (unicode >> 12)),
                (byte) (continuation | ((unicode >> 6) & lowSix)),
                (byte) (continuation | (unicode & lowSix))
            };
        } else if (unicode >= 0x10000 && unicode <= 0x10FFFF) {
            // Lower bound is inclusive: U+10000 is the first four-byte code point.
            bytes = new byte[] {
                (byte) (0b11110000 | (unicode >> 18)),
                (byte) (continuation | ((unicode >> 12) & lowSix)),
                (byte) (continuation | ((unicode >> 6) & lowSix)),
                (byte) (continuation | (unicode & lowSix))
            };
        } else {
            throw new IllegalArgumentException("error unicode");
        }
        return new String(bytes, StandardCharsets.UTF_8);
    }

    /**
     * Splits an int into four bytes, most significant byte first.
     * e.g. 19904 -> new byte[] {0b00000000, 0b00000000, 0b01001101, (byte) 0b11000000}
     *
     * @param a the value to split
     * @return a four-element array with the highest byte at index 0
     */
    private static byte[] intToByteArrayHighFirst(int a) {
        byte[] b = new byte[4];
        b[3] = (byte) (a & 0xff);
        b[2] = (byte) (a >> 8 & 0xff);
        b[1] = (byte) (a >> 16 & 0xff);
        b[0] = (byte) (a >> 24 & 0xff);
        return b;
    }

    /**
     * Renders a byte array as lower-case hexadecimal, two digits per byte.
     *
     * @param bytes the bytes to render
     * @return a string of {@code bytes.length * 2} hex digits
     */
    private static String toHexString(byte[] bytes) {
        StringBuilder hex = new StringBuilder(bytes.length * 2);
        for (byte b : bytes) {
            hex.append(Character.forDigit((b >> 4) & 0xF, 16));
            hex.append(Character.forDigit(b & 0xF, 16));
        }
        return hex.toString();
    }

}