首先,Lua 的字符串只是字节序列,本身不区分编码;下面的代码假定字符串内容是 UTF-8 编码(例如源文件以 UTF-8 保存)。
--- Convert a UTF-8 encoded string into a run of `\uXXXX` escapes.
-- Every escape is one UTF-16 code unit written as four lowercase hex digits,
-- high byte first (the same byte order as UTF-16BE). Code points above U+FFFF
-- are written as a surrogate pair, i.e. two consecutive escapes.
--
-- Malformed input (a stray continuation byte, an invalid lead byte, a
-- truncated sequence, or a value above U+10FFFF) produces `\ufffd` for the
-- offending byte and decoding resumes at the next byte. Overlong forms and
-- UTF-8-encoded surrogates are not rejected.
--
-- Only plain arithmetic is used, so no bit-operation library is required.
--
-- @param convertStr the UTF-8 string to convert
-- @return the escaped string; a non-string argument is returned unchanged
local function utf8_to_unicode(convertStr)
  if type(convertStr) ~= "string" then
    return convertStr
  end

  local REPLACEMENT = 0xFFFD
  local byte, format, floor = string.byte, string.format, math.floor
  local len = #convertStr
  local parts = {}   -- collected escapes, joined once at the end
  local n = 0
  local i = 1

  while i <= len do
    local lead = byte(convertStr, i)
    local extra, codepoint

    -- Classify the lead byte: number of continuation bytes and payload bits.
    if lead < 0x80 then
      extra, codepoint = 0, lead
    elseif lead >= 0xC0 and lead < 0xE0 then
      extra, codepoint = 1, lead - 0xC0
    elseif lead >= 0xE0 and lead < 0xF0 then
      extra, codepoint = 2, lead - 0xE0
    elseif lead >= 0xF0 and lead < 0xF8 then
      extra, codepoint = 3, lead - 0xF0
    end

    if codepoint then
      -- Each continuation byte must look like 10xxxxxx and adds six bits.
      for k = 1, extra do
        local cont = byte(convertStr, i + k)
        if cont == nil or cont < 0x80 or cont > 0xBF then
          codepoint = nil
          break
        end
        codepoint = codepoint * 64 + (cont - 0x80)
      end
    end

    if codepoint == nil or codepoint > 0x10FFFF then
      -- Malformed: substitute U+FFFD and skip only the offending byte.
      codepoint = REPLACEMENT
      extra = 0
    end

    n = n + 1
    if codepoint >= 0x10000 then
      -- Outside the BMP: split into a UTF-16 high/low surrogate pair.
      local offset = codepoint - 0x10000
      parts[n] = format("\\u%04x\\u%04x",
        0xD800 + floor(offset / 0x400),
        0xDC00 + offset % 0x400)
    else
      parts[n] = format("\\u%04x", codepoint)
    end

    i = i + extra + 1
  end

  return table.concat(parts)
end
使用:
-- Escape the sample text, strip every `\u` prefix, and print the remaining
-- hex digits in upper case.
local escaped = utf8_to_unicode('吉米 abc')
local hexDigits = escaped:gsub('\\u', '')  -- keep only the string; the match count is dropped
local ret = hexDigits:upper()
print(ret)
配合 java 验证:
/**
 * Prints each byte of {@code bt} as an unsigned hexadecimal number followed by
 * a space, then prints the array length and ends the line.
 * Values below 0x10 are printed as a single digit (no zero padding).
 *
 * @param bt the bytes to dump to standard output
 */
public static void printbyte(byte[] bt)
{
    for (byte b : bt) {
        // Mask to 0..255 so negative bytes are not sign-extended.
        System.out.print(Integer.toHexString(b & 0xff) + " ");
    }
    System.out.println(" length = " + bt.length);
}
/**
 * Dumps the bytes of a mixed Chinese/ASCII string under the platform default
 * charset and under UTF-8, UTF-16BE, UTF-16LE and GBK, so the encodings can be
 * compared by eye.
 *
 * @throws UnsupportedEncodingException if one of the named charsets is unavailable
 */
@Test
public void test1() throws UnsupportedEncodingException {
    String name = "杨彬abc";

    // Platform default charset first, then each explicitly named charset in order.
    printbyte(name.getBytes());
    String[] charsets = { "utf-8", "utf-16be", "utf-16le", "gbk" };
    for (String charset : charsets) {
        printbyte(name.getBytes(charset));
    }

    /*
     * Output recorded by the original author:
     *
     * e6 9d a8 e5 bd ac 61 62 63  length = 9    default charset
     * e6 9d a8 e5 bd ac 61 62 63  length = 9    utf-8
     *   UTF-8 uses three bytes for each of these Chinese characters and one
     *   byte for each English letter.
     *
     *   UTF-16 ("unicode") uses two bytes for each of these Chinese characters
     *   and for each English letter.
     * 67 68 5f 6c 0 61 0 62 0 63  length = 10   big-endian (utf-16be)
     * 68 67 6c 5f 61 0 62 0 63 0  length = 10   little-endian (utf-16le)
     *
     *   GBK uses two bytes for each Chinese character.
     * d1 ee b1 f2 61 62 63  length = 7
     */
    // Original author's note: for socket communication, strings are stored as UTF-16LE.
}
http://www.cocoachina.com/bbs/read.php?tid-312194-page-1.html
http://yangbinfx.iteye.com/blog/1768501
http://blog.csdn.net/operhero1990/article/details/47044697
网友评论