1.源码实现
#include <iostream>
#include <cstring>
/**
 * Stateless helpers that convert UTF-16 code units (stored one per
 * `unsigned short`, native byte order) to UCS-4 or UTF-8.
 *
 * None of the functions take an output-buffer size, so the caller must
 * provide enough room; the required sizes are stated per function.
 */
class UTF16 {
public:
    /**
     * Decodes the single code point that starts at utf16[0].
     *
     * utf16[1] is only read when utf16[0] is a high surrogate.
     *
     * @param utf16 Input code units (one unit, or two for a surrogate pair).
     * @param ucs4  Receives the code point split into two 16-bit halves:
     *              ucs4[0] = low 16 bits, ucs4[1] = high 16 bits.
     *              Left untouched on error.
     * @return Number of UTF-16 units consumed (1 or 2), or -1 for a lone
     *         or mis-ordered surrogate.
     */
    static int toUCS4(const unsigned short *utf16, unsigned short *ucs4);

    /**
     * Encodes the single code point that starts at utf16[0] as UTF-8.
     *
     * utf16[1] is only read when utf16[0] is a high surrogate.
     * The output is NOT NUL-terminated.
     *
     * @param utf16 Input code units (one unit, or two for a surrogate pair).
     * @param utf8  Output buffer; up to 4 bytes are written.
     * @return Number of bytes written (1..4), or -1 for a lone or
     *         mis-ordered surrogate (nothing is written in that case).
     */
    static int toUTF8(const unsigned short *utf16, unsigned char *utf8);

    /**
     * Encodes the first n UTF-16 code units as a NUL-terminated UTF-8 string.
     *
     * Conversion stops at the first invalid unit (lone or mis-ordered
     * surrogate, or a high surrogate that is the last of the n units);
     * everything converted up to that point is kept and terminated.
     *
     * @param utf16 Input code units; only indices [0, n) are read.
     * @param n     Number of input code units.
     * @param utf8  Output buffer; must hold at least 3 * n + 1 bytes.
     * @return Number of bytes written, not counting the terminating NUL.
     */
    static int toUTF8(const unsigned short *utf16, int n, unsigned char *utf8);
};
using namespace std;

namespace {

/**
 * Decodes one code point from the units at src.
 *
 * @param src   First code unit to decode.
 * @param avail Number of readable units at src (>= 1); src[1] is read only
 *              when src[0] is a high surrogate and avail >= 2.
 * @param cp    Receives the decoded code point on success.
 * @return Units consumed (1 or 2), or -1 if src[0] is a low surrogate, or a
 *         high surrogate that is truncated or not followed by a low surrogate.
 */
int decodeCodePoint(const unsigned short *src, int avail, char32_t &cp)
{
    const unsigned short lead = src[0];
    if (lead < 0xd800 || lead > 0xdfff)
    {
        // Ordinary BMP unit: it is the code point.
        cp = lead;
        return 1;
    }
    if (lead >= 0xdc00 || avail < 2)
    {
        // Low surrogate first, or a high surrogate with nothing after it.
        return -1;
    }
    const unsigned short trail = src[1];
    if (trail < 0xdc00 || trail > 0xdfff)
    {
        return -1;
    }
    // Each surrogate carries 10 payload bits; the pair encodes
    // (code point - 0x10000), so the offset is ADDED back, not OR-ed in.
    cp = 0x10000
       + (static_cast<char32_t>(lead & 0x3ff) << 10)
       + static_cast<char32_t>(trail & 0x3ff);
    return 2;
}

/**
 * Writes the UTF-8 encoding of cp to out.
 *
 * Only the 1- to 4-byte forms are produced, which covers every code point
 * decodeCodePoint() can return (at most U+10FFFF).
 *
 * @return Number of bytes written (1..4).
 */
int encodeUtf8(char32_t cp, unsigned char *out)
{
    if (cp <= 0x7f)
    {
        /* U+0000 - U+007F: 0xxxxxxx */
        out[0] = static_cast<unsigned char>(cp);
        return 1;
    }
    if (cp <= 0x7ff)
    {
        /* U+0080 - U+07FF: 110xxxxx 10xxxxxx */
        out[0] = static_cast<unsigned char>(0xc0 | (cp >> 6));
        out[1] = static_cast<unsigned char>(0x80 | (cp & 0x3f));
        return 2;
    }
    if (cp <= 0xffff)
    {
        /* U+0800 - U+FFFF: 1110xxxx 10xxxxxx 10xxxxxx */
        out[0] = static_cast<unsigned char>(0xe0 | (cp >> 12));
        out[1] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3f));
        out[2] = static_cast<unsigned char>(0x80 | (cp & 0x3f));
        return 3;
    }
    /* U+10000 - U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    out[0] = static_cast<unsigned char>(0xf0 | ((cp >> 18) & 0x07));
    out[1] = static_cast<unsigned char>(0x80 | ((cp >> 12) & 0x3f));
    out[2] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3f));
    out[3] = static_cast<unsigned char>(0x80 | (cp & 0x3f));
    return 4;
}

} // namespace

int UTF16::toUCS4(const unsigned short *utf16, unsigned short *ucs4)
{
    char32_t cp = 0;
    // The single-code-point API has no length; the caller guarantees that
    // utf16[1] is readable whenever utf16[0] is a high surrogate.
    const int used = decodeCodePoint(utf16, 2, cp);
    if (used < 0)
    {
        return -1;
    }
    ucs4[0] = static_cast<unsigned short>(cp & 0xffff);
    ucs4[1] = static_cast<unsigned short>(cp >> 16);
    return used;
}

int UTF16::toUTF8(const unsigned short *utf16, unsigned char *utf8)
{
    char32_t cp = 0;
    if (decodeCodePoint(utf16, 2, cp) < 0)
    {
        return -1;
    }
    return encodeUtf8(cp, utf8);
}

int UTF16::toUTF8(const unsigned short *utf16, int n, unsigned char *utf8)
{
    int written = 0;
    int pos = 0;
    while (pos < n)
    {
        char32_t cp = 0;
        // Passing the remaining length keeps a trailing high surrogate
        // from triggering a read past utf16[n - 1].
        const int used = decodeCodePoint(utf16 + pos, n - pos, cp);
        if (used < 0)
        {
            break; // keep what was converted so far
        }
        written += encodeUtf8(cp, utf8 + written);
        pos += used;
    }
    utf8[written] = 0x00;
    return written;
}
/**
 * Demo: converts two UTF-16 code units (U+4F60, U+597D) to UTF-8 and
 * prints the result followed by a newline.
 */
int main()
{
    // Two code units of text followed by zero padding.
    const unsigned short utf16[4] = {0x4F60, 0x597D, 0x00, 0x00};
    unsigned char utf8[128];
    UTF16::toUTF8(utf16, 2, utf8);
    // The counted toUTF8 overload NUL-terminates its output, so the buffer
    // can be streamed as a C string. The cast is needed because stream
    // insertion of `unsigned char *` is only string-like for char pointers
    // in spirit; making it explicit avoids relying on printf("%s") with a
    // non-char pointer and on <cstdio>, which this file never includes.
    std::cout << reinterpret_cast<const char *>(utf8) << '\n';
    return 0;
}
2.编译源码
$ g++ -o test test.cpp -std=c++11
3.运行及其结果
$ ./test
你好
网友评论