1.UTF-16编码规则
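在UTF-16中,一个码点用单个16位码元或两个16位码元(共32位,称为代理对)表示。代理对的前16位(高位代理)形如 1101 10xx xxxx xxxx,取值区间为 D800~DBFF;后16位(低位代理)形如 1101 11xx xxxx xxxx,取值区间为 DC00~DFFF;其余取值均为单16位码元,直接表示码点本身。代理对对应的码点为 0x10000 + (((前16位 & 0x3FF) << 10) | (后16位 & 0x3FF))。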
2.源码实现
#include <cstdio>
#include <cstring>
#include <iostream>
using namespace std;
/// UTF-16 decoding helpers.
class UTF16 {
public:
    /// Decodes the code point that starts at utf16[0].
    ///
    /// @param utf16 Input code units. utf16[1] is read only when utf16[0] is a
    ///              high surrogate (D800..DBFF).
    /// @param ucs4  Output buffer of two 16-bit halves: ucs4[0] receives the low
    ///              16 bits of the code point and ucs4[1] the high 16 bits.
    ///              It is left untouched when the function returns -1.
    /// @return 1 when a single code unit was consumed, 2 when a surrogate pair
    ///         was consumed, -1 when the input is not a valid sequence (a lone
    ///         low surrogate, or a high surrogate not followed by a low one).
    static int toUCS4(const unsigned short *utf16, unsigned short *ucs4);
};

int UTF16::toUCS4(const unsigned short *utf16, unsigned short *ucs4)
{
    const unsigned int lead = utf16[0];

    // Anything outside the surrogate range is the code point itself.
    if (lead < 0xd800 || lead > 0xdfff)
    {
        ucs4[0] = utf16[0];
        ucs4[1] = 0x00;
        return 1;
    }

    // A low surrogate (DC00..DFFF) may not start a sequence.
    if (lead >= 0xdc00)
    {
        return -1;
    }

    // A high surrogate must be followed by a low surrogate.
    const unsigned int trail = utf16[1];
    if (trail < 0xdc00 || trail > 0xdfff)
    {
        return -1;
    }

    // Each surrogate carries 10 payload bits; the pair encodes the offset of
    // the code point above U+10000, so the base has to be ADDED. OR-ing in the
    // bit instead produces wrong results whenever bit 16 of the offset is set
    // (for example D840 DC00, which is U+20000).
    const unsigned int codePoint =
        0x10000u + (((lead & 0x3ffu) << 10) | (trail & 0x3ffu));

    ucs4[0] = static_cast<unsigned short>(codePoint & 0xffffu);
    ucs4[1] = static_cast<unsigned short>(codePoint >> 16);
    return 2;
}
// Demo driver: decodes one sample surrogate pair and prints the resulting
// code point as eight hexadecimal digits.
int main()
{
    // D802 DC01 is the surrogate pair for U+10801.
    const unsigned short utf16[4] = {0xd802, 0xdc01, 0x00, 0x00};
    unsigned short ucs4[2] = {0, 0};

    if (UTF16::toUCS4(utf16, ucs4) < 0)
    {
        std::fprintf(stderr, "invalid UTF-16 sequence\n");
        return 1;
    }

    // Combine the two halves arithmetically (ucs4[1] is the high half) rather
    // than reading the array through an unsigned int pointer: that cast breaks
    // the strict-aliasing rule and only gives the right value on little-endian
    // machines.
    const unsigned int codePoint =
        (static_cast<unsigned int>(ucs4[1]) << 16) | ucs4[0];
    std::printf("%08x\n", codePoint);
    return 0;
}
3.编译源码
$ g++ -o test test.cpp -std=c++11
4.运行及其结果
$ ./test
00010801
网友评论