美文网首页
c++实现UTF-16转UTF-8

c++实现UTF-16转UTF-8

作者: 一路向后 | 来源:发表于2021-11-24 23:01 被阅读0次

    1.源码实现

    #include <cstdio>
    #include <cstring>
    #include <iostream>
    
    /**
     * Stateless helpers that convert UTF-16 code units to UCS-4 / UTF-8.
     *
     * All functions work on raw, caller-owned buffers and perform no
     * allocation. Input is a sequence of 16-bit UTF-16 code units held in
     * `unsigned short`; unpaired or reversed surrogates are reported as errors.
     */
    class UTF16 {
    public:
        /**
         * Decodes the single code point that starts at utf16[0].
         *
         * @param utf16 Input code units. utf16[1] is read only when utf16[0]
         *              is a high surrogate, so the caller must make that
         *              element readable (a 0 terminator is sufficient).
         * @param ucs4  Output, two elements: ucs4[0] receives the low 16 bits
         *              of the code point and ucs4[1] the high 16 bits.
         * @return 1 if one code unit was consumed, 2 if a surrogate pair was
         *         consumed, -1 on an invalid surrogate (ucs4 is left untouched).
         */
        static int toUCS4(const unsigned short *utf16, unsigned short *ucs4);
    
        /**
         * Encodes the single code point that starts at utf16[0] as UTF-8.
         *
         * @param utf16 Input code units; same readability contract as toUCS4().
         * @param utf8  Output buffer with room for at least 4 bytes. No
         *              terminating NUL is written.
         * @return Number of bytes written (1..4), or -1 on an invalid surrogate.
         */
        static int toUTF8(const unsigned short *utf16, unsigned char *utf8);
    
        /**
         * Converts n UTF-16 code units to a NUL-terminated UTF-8 string.
         *
         * Conversion stops at the first invalid surrogate, including a high
         * surrogate that is the last of the n units; whatever was converted up
         * to that point is kept and terminated.
         *
         * @param utf16 Input code units; exactly n elements are readable.
         * @param n     Number of code units to convert (<= 0 yields "").
         * @param utf8  Output buffer with room for at least 3 * n + 1 bytes.
         * @return Number of bytes written, not counting the terminating NUL.
         */
        static int toUTF8(const unsigned short *utf16, int n, unsigned char *utf8);
    
    private:
        /* Decodes one code point; returns code units consumed (1 or 2) or -1. */
        static int decode(const unsigned short *utf16, int avail, unsigned int &cp);
    
        /* Writes cp (<= 0x10FFFF) as UTF-8; returns the number of bytes (1..4). */
        static int encode(unsigned int cp, unsigned char *utf8);
    };
    
    /* Kept from the original file: code following this block may rely on it. */
    using namespace std;
    
    int UTF16::decode(const unsigned short *utf16, int avail, unsigned int &cp)
    {
        const unsigned int lead = utf16[0];
    
        /* Anything outside D800..DFFF is a BMP code point on its own. */
        if(lead < 0xd800 || lead > 0xdfff)
        {
            cp = lead;
    
            return 1;
        }
    
        /* A low surrogate cannot start a pair, and a high surrogate needs a
           second unit that is actually inside the readable range. */
        if(lead >= 0xdc00 || avail < 2)
        {
            return -1;
        }
    
        const unsigned int trail = utf16[1];
    
        if(trail < 0xdc00 || trail > 0xdfff)
        {
            return -1;
        }
    
        /* The 20 payload bits are offset by 0x10000. This must be an addition:
           OR-ing the offset in loses the carry for planes 2, 4, 6, ... */
        cp = 0x10000u + ((lead & 0x3ffu) << 10) + (trail & 0x3ffu);
    
        return 2;
    }
    
    int UTF16::encode(unsigned int cp, unsigned char *utf8)
    {
        if(cp <= 0x0000007f)
        {
            /*U-00000000 - U-0000007F:  0xxxxxxx*/
            utf8[0] = static_cast<unsigned char>(cp);
    
            return 1;
        }
    
        if(cp <= 0x000007ff)
        {
            /*U-00000080 - U-000007FF:  110xxxxx 10xxxxxx*/
            utf8[0] = static_cast<unsigned char>(0xc0 | (cp >> 6));
            utf8[1] = static_cast<unsigned char>(0x80 | (cp & 0x3f));
    
            return 2;
        }
    
        if(cp <= 0x0000ffff)
        {
            /*U-00000800 - U-0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx*/
            utf8[0] = static_cast<unsigned char>(0xe0 | (cp >> 12));
            utf8[1] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3f));
            utf8[2] = static_cast<unsigned char>(0x80 | (cp & 0x3f));
    
            return 3;
        }
    
        /*U-00010000 - U-0010FFFF:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx*/
        /* UTF-16 cannot express anything above U+10FFFF, so the historical
           5- and 6-byte UTF-8 forms are never needed here. */
        utf8[0] = static_cast<unsigned char>(0xf0 | ((cp >> 18) & 0x07));
        utf8[1] = static_cast<unsigned char>(0x80 | ((cp >> 12) & 0x3f));
        utf8[2] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3f));
        utf8[3] = static_cast<unsigned char>(0x80 | (cp & 0x3f));
    
        return 4;
    }
    
    int UTF16::toUCS4(const unsigned short *utf16, unsigned short *ucs4)
    {
        unsigned int cp = 0;
        /* No length is passed in, so utf16[1] is assumed readable (see the
           declaration's contract). */
        const int used = decode(utf16, 2, cp);
    
        if(used < 0)
        {
            return -1;
        }
    
        /* Split explicitly instead of type-punning, so the result does not
           depend on host byte order. */
        ucs4[0] = static_cast<unsigned short>(cp & 0xffff);
        ucs4[1] = static_cast<unsigned short>(cp >> 16);
    
        return used;
    }
    
    int UTF16::toUTF8(const unsigned short *utf16, unsigned char *utf8)
    {
        unsigned int cp = 0;
    
        if(decode(utf16, 2, cp) < 0)
        {
            return -1;
        }
    
        return encode(cp, utf8);
    }
    
    int UTF16::toUTF8(const unsigned short *utf16, int n, unsigned char *utf8)
    {
        int in = 0;
        int out = 0;
    
        while(in < n)
        {
            unsigned int cp = 0;
            /* Passing the remaining length keeps decode() from reading past
               utf16[n - 1] when the input ends in a high surrogate. */
            const int used = decode(utf16 + in, n - in, cp);
    
            if(used < 0)
            {
                break;
            }
    
            out += encode(cp, utf8 + out);
            in += used;
        }
    
        utf8[out] = 0x00;
    
        return out;
    }
    
    /**
     * Demo entry point: converts the two UTF-16 code units U+4F60 U+597D to
     * UTF-8 and prints the result followed by a newline.
     *
     * @return Always 0.
     */
    int main()
    {
        /* Only the first two units are converted; the trailing zeros are padding. */
        const unsigned short utf16[4] = {0x4F60, 0x597D, 0x00, 0x00};
        /* Two BMP code points need at most 2 * 3 bytes plus the NUL, so 128
           bytes is ample. */
        unsigned char utf8[128];
    
        UTF16::toUTF8(utf16, 2, utf8);
    
        /* The buffer is NUL-terminated by toUTF8(); cast explicitly because
           the buffer is unsigned char while %s is conventionally fed a char*. */
        std::printf("%s\n", reinterpret_cast<const char *>(utf8));
    
        return 0;
    }
    

    2.编译源码

    $ g++ -o test test.cpp -std=c++11
    

    3.运行及其结果

    $ ./test
    你好
    

    相关文章

      网友评论

          本文标题:c++实现UTF-16转UTF-8

          本文链接:https://www.haomeiwen.com/subject/xegotrtx.html