美文网首页
用go学习nlp -- 分词

用go学习nlp -- 分词

作者: frank3 | 来源:发表于2020-10-20 19:54 被阅读0次

    参考文章 https://github.com/NLP-LOVE/Introduction-NLP/blob/master/chapter/2.词典分词.md

    废话不多说了,Go 代码如下:

    package level
    
    import (
        "bufio"
        "fmt"
        "io"
        "os"
        "strings"
    )
    
    // PLN is a dictionary-based word segmenter: its methods look up substrings
    // of the input text in Dict.
    type PLN struct {
        // Dict is keyed by word. loadDict fills it from a tab-separated
        // dictionary file, using the first field of each line as the key.
        Dict map[string]string
    }
    
    // NewPLN allocates a PLN and loads its dictionary from dictFile.
    //
    // A load failure is not returned to the caller: the PLN is handed back
    // either way.
    func NewPLN(dictFile string) *PLN {
        pln := new(PLN)
        // This constructor has no error return, so the result of loadDict is
        // deliberately discarded here.
        _ = pln.loadDict(dictFile)
        return pln
    }
    
    // loadDict reads a tab-separated dictionary file and stores it in p.Dict.
    //
    // Every non-blank line needs at least two tab-separated fields: the first
    // field becomes the key and the whole line, without its line terminator,
    // becomes the value. Non-blank lines with fewer fields are reported on
    // stdout and skipped; blank lines are ignored. p.Dict is only replaced
    // once the whole file has been read successfully.
    //
    // It returns the error from opening or reading the file, or nil on success.
    func (p *PLN) loadDict(file string) error {
        f, err := os.Open(file)
        if err != nil {
            fmt.Println("load dict error: ", err)
            // Stop here: reading from the nil file would only produce a
            // second, misleading error.
            return err
        }
        defer f.Close()

        dic := make(map[string]string)
        buf := bufio.NewReader(f)
        for {
            line, err := buf.ReadString('\n')
            if err != nil && err != io.EOF {
                fmt.Println("raad dict file fail", err)
                return err
            }

            // Strip "\n" / "\r\n" so every stored entry has the same shape,
            // whether or not it was the last line of the file.
            line = strings.TrimRight(line, "\r\n")
            if line != "" {
                if c := strings.Split(line, "\t"); len(c) > 1 {
                    dic[c[0]] = line
                } else {
                    fmt.Println("dict line split fail", c)
                }
            }

            // ReadString hands back the final piece of data together with
            // io.EOF, so the loop ends only after that piece was processed.
            if err == io.EOF {
                break
            }
        }

        p.Dict = dic
        return nil
    }
    
    // FullSegment returns every substring of text that is a key of p.Dict.
    //
    // Substrings are taken on rune boundaries. Results are ordered by start
    // position and, for the same start, from shortest to longest. The result
    // is nil when nothing matches.
    func (p *PLN) FullSegment(text string) []string {
        runes := []rune(text)
        var words []string

        for start := range runes {
            for end := start + 1; end <= len(runes); end++ {
                word := string(runes[start:end])
                if _, found := p.Dict[word]; !found {
                    continue
                }
                words = append(words, word)
            }
        }
        return words
    }
    
    // ForwardSegment splits text by forward longest matching: scanning left to
    // right, it repeatedly takes the longest key of p.Dict that starts at the
    // current rune.
    //
    // If no key starts at the current position, that single rune is emitted as
    // its own token. This guarantees the scan always advances; without the
    // fallback an out-of-dictionary rune would stall the loop forever.
    //
    // The returned tokens concatenate back to text. The result is nil for an
    // empty text.
    func (p *PLN) ForwardSegment(text string) []string {
        var ret []string
        s := []rune(text)

        for i := 0; i < len(s); {
            // Default to one rune so an unknown character is still consumed.
            count := 1
            // Try candidates from longest to shortest; the first hit is the
            // longest match, so the search can stop there.
            for j := len(s); j > i+1; j-- {
                if _, ok := p.Dict[string(s[i:j])]; ok {
                    count = j - i
                    break
                }
            }
            ret = append(ret, string(s[i:i+count]))
            i += count
        }
        return ret
    }
    
    // BackwardSegment splits text by backward longest matching: scanning right
    // to left, it repeatedly takes the longest key of p.Dict that ends at the
    // current position.
    //
    // If no key ends at the current position, the single rune before it is
    // emitted as its own token. This guarantees the scan always advances;
    // without the fallback an out-of-dictionary rune would stall the loop
    // forever.
    //
    // Tokens are returned in reading order and concatenate back to text. The
    // result is nil for an empty text.
    func (p *PLN) BackwardSegment(text string) []string {
        var ret []string
        s := []rune(text)

        for j := len(s); j > 0; {
            // Default to one rune so an unknown character is still consumed.
            count := 1
            // Starting from the leftmost i tries the longest candidate first,
            // so the first hit is the longest match.
            for i := 0; i < j-1; i++ {
                if _, ok := p.Dict[string(s[i:j])]; ok {
                    count = j - i
                    break
                }
            }
            ret = append(ret, string(s[j-count:j]))
            j -= count
        }

        // Tokens were collected from the end of the text; flip them in place
        // to restore reading order.
        for l, r := 0, len(ret)-1; l < r; l, r = l+1, r-1 {
            ret[l], ret[r] = ret[r], ret[l]
        }
        return ret
    }
    
    // countSingleChar reports how many elements of chars consist of exactly
    // one rune. Empty strings and multi-rune strings are not counted.
    func countSingleChar(chars []string) int {
        var singles int
        for i := range chars {
            if len([]rune(chars[i])) != 1 {
                continue
            }
            singles++
        }
        return singles
    }
    
    // BidirectionalSegment runs both ForwardSegment and BackwardSegment on text
    // and returns the preferred result:
    //
    //  1. the segmentation with fewer tokens wins;
    //  2. on a tie, the one with fewer single-rune tokens wins;
    //  3. if still tied, the backward result is returned.
    func (p *PLN) BidirectionalSegment(text string) []string {
        f := p.ForwardSegment(text)
        b := p.BackwardSegment(text)

        switch {
        case len(f) < len(b):
            return f
        case len(f) > len(b):
            return b
        case countSingleChar(f) < countSingleChar(b):
            // Same token count: forward wins only when it has strictly fewer
            // single-rune tokens.
            return f
        default:
            return b
        }
    }
    

    测试代码:

    package level
    
    import "testing"
    
    // TestLoadDict loads the mini dictionary and logs up to ten of its
    // entries, the entry stored under "项目", and the total entry count. It
    // only logs; it makes no assertions.
    func TestLoadDict(t *testing.T) {
        pln := NewPLN("CoreNatureDictionary.mini.txt")

        remaining := 10
        for word, entry := range pln.Dict {
            t.Logf("k:%v v:%v\n", word, entry)
            remaining--
            if remaining <= 0 {
                break
            }
        }

        t.Log("项目: ", pln.Dict["项目"])

        t.Log("dict len: ", len(pln.Dict))
    }
    
    // testCase holds the sample sentences that the segmentation tests in this
    // file iterate over.
    var testCase = []string{
        "就读北京大学",
        "研究生命起源",
        "项目的研究",
        "当下雨天地面积水",
        "结婚的和未结婚的",
        "欢迎新老师生前来就餐",
    }
    
    // TestFullSegment logs the FullSegment output for every sentence in
    // testCase. It only logs; it makes no assertions.
    func TestFullSegment(t *testing.T) {
        pln := NewPLN("CoreNatureDictionary.mini.txt")

        for i := range testCase {
            words := pln.FullSegment(testCase[i])
            t.Logf("full seg: %v", words)
        }
    }
    
    // TestForwardSegment logs the ForwardSegment output for every sentence in
    // testCase. It only logs; it makes no assertions.
    func TestForwardSegment(t *testing.T) {
        pln := NewPLN("CoreNatureDictionary.mini.txt")
        for i := range testCase {
            words := pln.ForwardSegment(testCase[i])
            t.Logf("forward seg: %v", words)
        }
    }
    
    // TestBackwardSegment logs the BackwardSegment output for every sentence
    // in testCase. It only logs; it makes no assertions.
    func TestBackwardSegment(t *testing.T) {
        pln := NewPLN("CoreNatureDictionary.mini.txt")
        for i := range testCase {
            words := pln.BackwardSegment(testCase[i])
            t.Logf("back seg: %v", words)
        }
    }
    
    // TestBidirectionalSegment logs the BidirectionalSegment output for every
    // sentence in testCase. It only logs; it makes no assertions.
    func TestBidirectionalSegment(t *testing.T) {
        pln := NewPLN("CoreNatureDictionary.mini.txt")
        for i := range testCase {
            words := pln.BidirectionalSegment(testCase[i])
            t.Logf("bi seg: %v", words)
        }
    }
    

    相关文章

      网友评论

          本文标题:用go学习nlp -- 分词

          本文链接:https://www.haomeiwen.com/subject/gjfjmktx.html