UTF8字符串在lua的截取和字数统计

作者: 自由快挂 | 来源:发表于2017-05-05 14:08 被阅读220次

UTF8字符串在lua的截取和字数统计
Lua(模糊查找):判断两个字符串(含中文)是否存在至少一个相同
大端 Unicode 字符串
NSString 字符串查找与截取
mac 终端常见命令
iOS 字符串常见处理
swift-字符串截取截取指定字符前面或后面的字符串
2019-05-24
OC - NSString
SQL学习笔记2：函数运算

link: https://github.com/pangliang/pangliang.github.com/blob/master/_posts/2014-08-23-UTF8%E5%AD%97%E7%AC%A6%E4%B8%B2%E5%9C%A8lua%E7%9A%84%E6%88%AA%E5%8F%96%E5%92%8C%E5%AD%97%E6%95%B0%E7%BB%9F%E8%AE%A1.md

--- Return the encoded byte length of a UTF-8 character, given its lead byte.
-- Lead-byte ranges:
--   0xxxxxxx (0-127)   -> 1 byte
--   110xxxxx (192-223) -> 2 bytes
--   1110xxxx (224-239) -> 3 bytes
--   11110xxx (240-247) -> 4 bytes
-- Continuation bytes (128-191) are reported as 1 so that callers always
-- advance, and values above 247 are reported as 4.
-- @param char  byte value as returned by string.byte, or nil
-- @return 0 when char is nil (after printing "not char"), otherwise 1..4
local function chsize(char)
    if not char then
        print("not char")
        return 0
    elseif char >= 240 then
        -- inclusive bound: 0xF0 itself starts a 4-byte sequence
        return 4
    elseif char >= 224 then
        -- inclusive bound: 0xE0 (224) is the first 3-byte lead byte
        return 3
    elseif char >= 192 then
        return 2
    else
        return 1
    end
end

--- Count the characters in a UTF-8 string.
-- Every character counts as one, whatever its encoded byte width.
-- Example: utf8len("1你好") => 3
-- @param str  the UTF-8 encoded string to measure
-- @return number of characters found by walking the lead bytes
function utf8len(str)
    local byteCount = #str
    local pos, total = 1, 0
    while pos <= byteCount do
        -- jump over the whole character that starts at pos
        pos = pos + chsize(string.byte(str, pos))
        total = total + 1
    end
    return total
end

--- Extract a substring of a UTF-8 string, addressed in characters.
-- @param str        the string to cut
-- @param startChar  1-based index of the first character to keep
-- @param numChars   number of characters to keep
-- @return the selected characters; "" when startChar lies past the end
function utf8sub(str, startChar, numChars)
    local byteLen = #str

    -- Skip the first startChar-1 characters. The byte-length bound stops the
    -- walk at the end of the string, so an out-of-range startChar no longer
    -- feeds nil bytes to chsize (which would print "not char" once per
    -- remaining iteration).
    local startIndex = 1
    while startChar > 1 and startIndex <= byteLen do
        local char = string.byte(str, startIndex)
        startIndex = startIndex + chsize(char)
        startChar = startChar - 1
    end

    -- Advance over at most numChars characters from the start position.
    local currentIndex = startIndex
    while numChars > 0 and currentIndex <= byteLen do
        local char = string.byte(str, currentIndex)
        currentIndex = currentIndex + chsize(char)
        numChars = numChars - 1
    end

    return str:sub(startIndex, currentIndex - 1)
end

-- Self-test: checks utf8len and utf8sub against fixed samples and prints a
-- confirmation line when every assertion holds.
function test()
    -- { input, expected character count }
    local lenCases = {
        { "你好1世界哈哈", 7 },
        { "你好世界1哈哈 ", 8 },
        { " 你好世 界1哈哈", 9 },
        { "12345678", 8 },
        { "øpø你好pix", 8 },
    }
    for _, c in ipairs(lenCases) do
        assert(utf8len(c[1]) == c[2])
    end

    -- { input, startChar, numChars, expected substring }
    local subCases = {
        { "你好1世界哈哈", 2, 5, "好1世界哈" },
        { "1你好1世界哈哈", 2, 5, "你好1世界" },
        { " 你好1世界 哈哈", 2, 6, "你好1世界 " },
        { "你好世界1哈哈", 1, 5, "你好世界1" },
        { "12345678", 3, 5, "34567" },
        { "øpø你好pix", 2, 5, "pø你好p" },
    }
    for _, c in ipairs(subCases) do
        assert(utf8sub(c[1], c[2], c[3]) == c[4])
    end

    print("all test succ")
end

-- Run the self-test as soon as this file is executed.
test()