美文网首页Golang深入浅出golang
golang unicode/utf8源码分析

golang unicode/utf8源码分析

作者: 第八共同体 | 来源:发表于2019-07-23 15:44 被阅读22次

    简介

    // Package utf8 implements functions and constants to support text encoded in
    // UTF-8. It includes functions to translate between runes and UTF-8 byte sequences.
    package utf8
    

    utf8包实现了用于支持UTF-8编码文本的函数和常量,其中包括rune与UTF-8字节序列之间的相互转换。Unicode本身只负责给字符分配码点,一个字符占多少字节取决于具体的编码方式:常用汉字在UTF-16中占两个字节,在UTF-8中占三个字节。Go的源码和字符串默认采用UTF-8编码,因此一个常用汉字占三个字节;而Go中的字符串底层实际上是一个只读的byte序列,所以len(str)返回的是字节数而不是字符数。

    package main
    
    import (
        "fmt"
        "reflect"
        "unicode/utf8"
    )
    
    // Numbers fundamental to the encoding.
    const (
        RuneError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
        RuneSelf  = 0x80         // bytes below RuneSelf (0x00-0x7F) are represented as themselves in a single byte.
        MaxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
        UTFMax    = 4            // maximum number of bytes of a UTF-8 encoded Unicode character.
    )
    
    const (
        // Single-byte bit patterns; the trailing comment spells each value in binary.
        // NOTE(review): presumably the tag bits of leading and continuation bytes;
        // the code that consumes them is not part of this listing.
        t1 = 0x00 // 0000 0000
        tx = 0x80 // 1000 0000
        t2 = 0xC0 // 1100 0000
        t3 = 0xE0 // 1110 0000
        t4 = 0xF0 // 1111 0000
        t5 = 0xF8 // 1111 1000
    
        // Masks selecting the low 6, 5, 4 and 3 bits of a byte.
        maskx = 0x3F // 0011 1111
        mask2 = 0x1F // 0001 1111
        mask3 = 0x0F // 0000 1111
        mask4 = 0x07 // 0000 0111
    
        // Largest values that fit in 7, 11 and 16 bits.
        rune1Max = 1<<7 - 1  // 127
        rune2Max = 1<<11 - 1 // 2047
        rune3Max = 1<<16 - 1 // 65535
    
        // The default lowest and highest continuation byte.
        // The functions below use them as the bounds for the third and fourth
        // byte, and acceptRanges uses them as the default second-byte bounds.
        locb = 0x80 // 1000 0000
        hicb = 0xBF // 1011 1111
    
        // The names of these constants are chosen to give nice alignment in the
        // table below. The first nibble is an index into acceptRanges or F for
        // special one-byte cases. The second nibble is the Rune length or the
        // Status for the special one-byte case.
        // The functions below read the index with x>>4 and the length with x&7.
        xx = 0xF1 // invalid: size 1
        as = 0xF0 // ASCII: size 1 (x&7 is 0 for this value)
        s1 = 0x02 // accept 0, size 2
        s2 = 0x13 // accept 1, size 3
        s3 = 0x03 // accept 0, size 3
        s4 = 0x23 // accept 2, size 3
        s5 = 0x34 // accept 3, size 4
        s6 = 0x04 // accept 0, size 4
        s7 = 0x44 // accept 4, size 4
    )
    
    // acceptRange is the inclusive interval [lo, hi] that the second byte of a
    // multi-byte UTF-8 sequence has to fall into.
    type acceptRange struct {
        lo, hi uint8
    }

    // acceptRanges lists the five possible second-byte intervals. An entry is
    // selected by the high nibble of a value taken from the first table.
    var acceptRanges = [...]acceptRange{
        0: {lo: locb, hi: hicb}, // 0x80-0xBF
        1: {lo: 0xA0, hi: hicb}, // 0xA0-0xBF
        2: {lo: locb, hi: 0x9F}, // 0x80-0x9F
        3: {lo: 0x90, hi: hicb}, // 0x90-0xBF
        4: {lo: locb, hi: 0x8F}, // 0x80-0x8F
    }
    
    // first is information about the first byte in a UTF-8 sequence.
    // It is indexed by the byte value and holds one of the codes as, xx or
    // s1-s7; callers read the sequence length with x&7 and the acceptRanges
    // index with x>>4.
    var first = [256]uint8{
        //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
        // 0x00-0x7F: every entry is as (ASCII).
        as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
        as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
        as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
        as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
        as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
        as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
        as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
        as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
        //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
        // 0x80-0xC1 and 0xF5-0xFF are xx: these bytes cannot start a sequence.
        xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
        xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
        xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
        xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
        xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
        s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
        s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
        s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
    }
    
    
    // RuneCountInString returns the number of runes in s. A byte that cannot
    // start a sequence, a truncated sequence and a sequence with a bad
    // continuation byte each count as one rune that is one byte wide.
    //
    // This copy is instrumented: it prints len(s) once and, for every
    // non-ASCII leading byte, the intermediate table lookups.
    func RuneCountInString(s string) (n int) {
        total := len(s)
        fmt.Println(total)
        for pos := 0; pos < total; {
            n++
            lead := s[pos]
            if lead < RuneSelf {
                // ASCII: one byte, one rune.
                pos++
                continue
            }
            fmt.Println("c=", lead)
            info := first[lead]
            fmt.Println("x=", info)
            if info == xx {
                // Not a legal leading byte.
                pos++
                continue
            }
            width := int(info & 7)
            fmt.Println("size=", width)
            if pos+width > total {
                // The sequence would run past the end of s.
                pos++
                continue
            }
            second := acceptRanges[info>>4]
            fmt.Println("accept: ", second)
            // Cases are tried in order, so a later byte is only inspected when
            // the sequence is long enough to contain it.
            switch {
            case s[pos+1] < second.lo || second.hi < s[pos+1]:
                width = 1
            case width == 2:
            case s[pos+2] < locb || hicb < s[pos+2]:
                width = 1
            case width == 3:
            case s[pos+3] < locb || hicb < s[pos+3]:
                width = 1
            }
            pos += width
        }
        return n
    }
    
    
    // FullRune reports whether p begins with a complete encoding of a rune.
    // An invalid encoding also counts as complete, so the result is false only
    // for an empty slice or a so-far-valid prefix of a longer sequence.
    //
    // This copy is instrumented: it prints the first byte of a non-empty p.
    func FullRune(p []byte) bool {
        if len(p) == 0 {
            return false
        }
        lead := p[0]
        fmt.Println("po=", lead)
        info := first[lead]
        have, want := len(p), int(info&7)
        if have >= want {
            return true // ASCII, invalid or valid.
        }
        // Fewer bytes than the leading byte announces: p is complete only if
        // one of the bytes already present is out of range.
        bounds := acceptRanges[info>>4]
        badSecond := have > 1 && !(bounds.lo <= p[1] && p[1] <= bounds.hi)
        badThird := have > 2 && !(locb <= p[2] && p[2] <= hicb)
        return badSecond || badThird
    }
    
    
    // FullRuneInString is the string counterpart of FullRune: it reports
    // whether s begins with a complete (possibly invalid) encoding of a rune.
    //
    // This copy is instrumented: it prints the table entry for the first byte,
    // its low three bits, and a marker line for each branch that returns true.
    func FullRuneInString(s string) bool {
        have := len(s)
        if have == 0 {
            return false
        }
        info := first[s[0]]
        want := info & 7
        fmt.Println("xxx= ", info)
        fmt.Println("x&7= ", want)
        if have >= int(want) {
            fmt.Println("--------")
            return true // ASCII, invalid, or valid.
        }
        // Fewer bytes than announced: complete only if a present byte is bad.
        bounds := acceptRanges[info>>4]
        switch {
        case have > 1 && (s[1] < bounds.lo || bounds.hi < s[1]):
            fmt.Println("xxxxxx")
            return true
        case have > 2 && (s[2] < locb || hicb < s[2]):
            fmt.Println("eeeee")
            return true
        }
        return false
    }
    
    // main runs the instrumented local helpers next to their standard library
    // counterparts and prints the byte view and the rune view of one string.
    func main() {
        fmt.Println(reflect.TypeOf(acceptRanges))

        str := "Hello, 钢铁侠"
        raw := []byte(str)
        runes := []rune(str)

        // The raw string literal starts with a backslash, which is an ASCII byte.
        fmt.Println(FullRuneInString(`\ubbbbbbb`))
        fmt.Println(FullRune(raw))
        fmt.Println(utf8.RuneCount(raw))

        // Byte view: indexing a string yields bytes, not characters.
        fmt.Println(str)
        for _, b := range raw {
            fmt.Println(b)
        }
        fmt.Println(raw)

        // Rune view: ranging over a string decodes one rune per iteration.
        for _, r := range str {
            fmt.Println(r)
        }
        fmt.Println(reflect.TypeOf(runes[4]))
        fmt.Println(runes)
        fmt.Println([]int32(str))

        fmt.Println(utf8.RuneCountInString(str))
        fmt.Println(RuneCountInString(str))
        fmt.Println(utf8.ValidString(str))
    }
    
    

    Output:

    [5]main.acceptRange
    xxx=  240
    x&7=  0
    --------
    true
    po= 72
    true
    10
    Hello, 钢铁侠
    72
    101
    108
    108
    111
    44
    32
    233
    146
    162
    233
    147
    129
    228
    190
    160
    [72 101 108 108 111 44 32 233 146 162 233 147 129 228 190 160]
    72
    101
    108
    108
    111
    44
    32
    38050
    38081
    20384
    int32
    [72 101 108 108 111 44 32 38050 38081 20384]
    [72 101 108 108 111 44 32 38050 38081 20384]
    10
    16
    c= 233
    x= 3
    size= 3
    accept:  {128 191}
    c= 233
    x= 3
    size= 3
    accept:  {128 191}
    c= 228
    x= 3
    size= 3
    accept:  {128 191}
    10
    true
    

    常量定义

    RuneSelf的值为0x80(十进制128),用于判断一个字节是否为单字节的ASCII字符:小于RuneSelf的字节本身就是一个完整的字符。hicb的值为0xBF(十进制191),是后续字节(continuation byte)的默认上界;0xFF对应的十进制值为255。

    // The conditions RuneError==unicode.ReplacementChar and
    // MaxRune==unicode.MaxRune are verified in the tests.
    // Defining them locally avoids this package depending on package unicode.
    
    // Numbers fundamental to the encoding.
    const (
        RuneError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
        RuneSelf  = 0x80         // bytes below RuneSelf (0x00-0x7F) are represented as themselves in a single byte.
        MaxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
        UTFMax    = 4            // maximum number of bytes of a UTF-8 encoded Unicode character.
    )
    
    // Code points in the surrogate range are not valid for UTF-8.
    // The first table gives the leading byte 0xED the code s4, which limits the
    // second byte to 0x80-0x9F and so rejects the encodings of this range.
    const (
        surrogateMin = 0xD800
        surrogateMax = 0xDFFF
    )
    
    const (
        // Single-byte bit patterns; the trailing comment spells each value in binary.
        // NOTE(review): presumably the tag bits of leading and continuation bytes;
        // the code that consumes them is not part of this listing.
        t1 = 0x00 // 0000 0000
        tx = 0x80 // 1000 0000
        t2 = 0xC0 // 1100 0000
        t3 = 0xE0 // 1110 0000
        t4 = 0xF0 // 1111 0000
        t5 = 0xF8 // 1111 1000
    
        // Masks selecting the low 6, 5, 4 and 3 bits of a byte.
        maskx = 0x3F // 0011 1111
        mask2 = 0x1F // 0001 1111
        mask3 = 0x0F // 0000 1111
        mask4 = 0x07 // 0000 0111
    
        // Largest values that fit in 7, 11 and 16 bits.
        rune1Max = 1<<7 - 1  // 127
        rune2Max = 1<<11 - 1 // 2047
        rune3Max = 1<<16 - 1 // 65535
    
        // The default lowest and highest continuation byte.
        // The functions below use them as the bounds for the third and fourth
        // byte, and acceptRanges uses them as the default second-byte bounds.
        locb = 0x80 // 1000 0000
        hicb = 0xBF // 1011 1111
    
        // The names of these constants are chosen to give nice alignment in the
        // table below. The first nibble is an index into acceptRanges or F for
        // special one-byte cases. The second nibble is the Rune length or the
        // Status for the special one-byte case.
        // The functions below read the index with x>>4 and the length with x&7.
        xx = 0xF1 // invalid: size 1
        as = 0xF0 // ASCII: size 1 (x&7 is 0 for this value)
        s1 = 0x02 // accept 0, size 2
        s2 = 0x13 // accept 1, size 3
        s3 = 0x03 // accept 0, size 3
        s4 = 0x23 // accept 2, size 3
        s5 = 0x34 // accept 3, size 4
        s6 = 0x04 // accept 0, size 4
        s7 = 0x44 // accept 4, size 4
    )
    
    // first is information about the first byte in a UTF-8 sequence.
    // It is indexed by the byte value and holds one of the codes as, xx or
    // s1-s7; callers read the sequence length with x&7 and the acceptRanges
    // index with x>>4.
    var first = [256]uint8{
        //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
        // 0x00-0x7F: every entry is as (ASCII).
        as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
        as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
        as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
        as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
        as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
        as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
        as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
        as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
        //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
        // 0x80-0xC1 and 0xF5-0xFF are xx: these bytes cannot start a sequence.
        xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
        xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
        xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
        xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
        xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
        s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
        s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
        s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
    }
    
    // acceptRange is the inclusive interval [lo, hi] of valid values for the
    // second byte in a UTF-8 sequence.
    type acceptRange struct {
        lo, hi uint8
    }

    // acceptRanges lists the five possible second-byte intervals. An entry is
    // selected by the high nibble of a value taken from the first table.
    var acceptRanges = [...]acceptRange{
        0: {lo: locb, hi: hicb}, // 0x80-0xBF
        1: {lo: 0xA0, hi: hicb}, // 0xA0-0xBF
        2: {lo: locb, hi: 0x9F}, // 0x80-0x9F
        3: {lo: 0x90, hi: hicb}, // 0x90-0xBF
        4: {lo: locb, hi: 0x8F}, // 0x80-0x8F
    }
    

    RuneCountInString

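    计算字符串中的rune数量。原理:依次取出字符串当前位置的字节c,如果c小于128(RuneSelf),说明是ASCII字符,前进1个字节后直接continue,rune个数加1。
    否则查表得到x = first[c]:如果x等于xx(0xF1),说明c不能作为首字节,是无效字节,同样前进1个字节后continue,rune个数加1,也就是说一个无效字节也被当成一个长度为1字节的rune。其余情况下,x与7按位与(x&7)得到该rune的字节长度size,x右移4位(x>>4)得到acceptRanges的下标。以上面的示例为例,“钢”的首字节为233(二进制11101001),first[233]为s3即0x03,x&7的值为3,所以长度为3个字节;x>>4的值为0,从acceptRanges中取出的是{locb, hicb},表示第二个字节必须位于0x80到0xBF之间。校验通过后i直接加上size,跳过这3个字节,rune个数加1;如果剩余字节不足size个,或者后续字节不在合法范围内,则只前进1个字节,同样计为一个rune。其他函数的处理流程类似,不再赘述。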

    // RuneCountInString is like RuneCount but its input is a string.
    func RuneCountInString(s string) (n int) {
        ns := len(s)
        fmt.Println(ns)
        for i := 0; i < ns; n++ {
            c := s[i]
            if c < RuneSelf {
                // ASCII fast path
                i++
                continue
            }
            fmt.Println("c=", c)
            x := first[c]
            fmt.Println("x=", x)
            if x == xx {
                i++ // invalid.
                continue
            }
            size := int(x & 7)
            fmt.Println("size=", size)
            if i+size > ns {
                i++ // Short or invalid.
                continue
            }
            accept := acceptRanges[x>>4]
            fmt.Println("accept: ", accept)
            if c := s[i+1]; c < accept.lo || accept.hi < c {
                size = 1
            } else if size == 2 {
            } else if c := s[i+2]; c < locb || hicb < c {
                size = 1
            } else if size == 3 {
            } else if c := s[i+3]; c < locb || hicb < c {
                size = 1
            }
            i += size
        }
        return n
    }
    

    示例:

    package main
    
    import (
        "fmt"
        "unicode/utf8"
    )
    
    // main prints the number of runes, not bytes, in a mixed ASCII/Chinese string.
    func main() {
        const greeting = "Hello, 钢铁侠"
        runes := utf8.RuneCountInString(greeting)
        fmt.Println(runes) // 10
    }
    

    ValidString

    ValidString报告参数字符串是否完全由合法的UTF-8编码的rune组成。

    // ValidString reports whether every byte of s belongs to a well-formed
    // UTF-8 encoded rune. The empty string is valid.
    func ValidString(s string) bool {
        for pos, end := 0, len(s); pos < end; {
            lead := s[pos]
            if lead < RuneSelf {
                // ASCII byte: always valid on its own.
                pos++
                continue
            }
            info := first[lead]
            if info == xx {
                return false // Illegal starter byte.
            }
            width := int(info & 7)
            if pos+width > end {
                return false // Sequence is cut off by the end of s.
            }
            second := acceptRanges[info>>4]
            // Cases are tried in order, so a later byte is only inspected when
            // the sequence is long enough to contain it.
            switch {
            case s[pos+1] < second.lo || second.hi < s[pos+1]:
                return false
            case width == 2:
            case s[pos+2] < locb || hicb < s[pos+2]:
                return false
            case width == 3:
            case s[pos+3] < locb || hicb < s[pos+3]:
                return false
            }
            pos += width
        }
        return true
    }
    

    RuneCount

    RuneCount返回字节切片p中包含的rune数量。把前面示例中的utf8.RuneCountInString(str)改成utf8.RuneCount([]byte(str)),返回的结果相同。错误的编码和被截断的(不完整的)编码都被当成一个长度为1字节的rune;像H这样的ASCII字符本身就是一个长度为1字节的rune。

    // RuneCount returns the number of runes in p. Erroneous and short
    // encodings are treated as single runes of width 1 byte.
    func RuneCount(p []byte) int {
        np := len(p)
        var n int
        for i := 0; i < np; {
            n++
            c := p[i]
            if c < RuneSelf {
                // ASCII fast path
                i++
                continue
            }
            x := first[c]
            if x == xx {
                i++ // invalid.
                continue
            }
            size := int(x & 7)
            if i+size > np {
                i++ // Short or invalid.
                continue
            }
            accept := acceptRanges[x>>4]
            if c := p[i+1]; c < accept.lo || accept.hi < c {
                size = 1
            } else if size == 2 {
            } else if c := p[i+2]; c < locb || hicb < c {
                size = 1
            } else if size == 3 {
            } else if c := p[i+3]; c < locb || hicb < c {
                size = 1
            }
            i += size
        }
        return n
    }
    

    FullRune

    该函数报告p是否以一个完整的UTF-8编码的rune开头;无效的编码也被视为完整的rune,因为它会被转换成宽度为1的错误rune。上面的例子中,字符串以ASCII字符H开头,
    执行first[p[0]]时p[0]为72,而first表中下标0到127的值都是as即0xF0(十进制240),与7按位与后的值为0,n >= 0恒成立,所以直接返回true。

    // FullRune reports whether p begins with a complete encoding of a rune.
    // An invalid encoding also counts as complete, so the result is false only
    // for an empty slice or a so-far-valid prefix of a longer sequence.
    func FullRune(p []byte) bool {
        have := len(p)
        if have == 0 {
            return false
        }
        info := first[p[0]]
        if want := int(info & 7); have >= want {
            return true // ASCII, invalid or valid.
        }
        // Fewer bytes than the leading byte announces: p is complete only if
        // one of the bytes already present is out of range.
        bounds := acceptRanges[info>>4]
        switch {
        case have > 1 && (p[1] < bounds.lo || bounds.hi < p[1]):
            return true
        case have > 2 && (p[2] < locb || hicb < p[2]):
            return true
        default:
            return false
        }
    }
    

    FullRuneInString

    和FullRune类似,只是参数为字符串形式

    // FullRuneInString is like FullRune but its input is a string.
    // It reports whether s begins with a full UTF-8 encoding of a rune. An
    // invalid encoding is considered a full rune, so the result is false only
    // for the empty string or a so-far-valid prefix of a longer sequence.
    //
    // The function has no side effects; it only reads s and the package tables.
    func FullRuneInString(s string) bool {
        n := len(s)
        if n == 0 {
            return false
        }
        x := first[s[0]]
        if n >= int(x&7) {
            return true // ASCII, invalid, or valid.
        }
        // Must be short or invalid: s is full only if one of the bytes that is
        // already present falls outside its allowed range.
        accept := acceptRanges[x>>4]
        if n > 1 && (s[1] < accept.lo || accept.hi < s[1]) {
            return true
        }
        if n > 2 && (s[2] < locb || hicb < s[2]) {
            return true
        }
        return false
    }
    

    相关文章

      网友评论

        本文标题:golang unicode/utf8源码分析

        本文链接:https://www.haomeiwen.com/subject/weiolctx.html