美文网首页
基于同义词的分词算法

基于同义词的分词算法

作者: 吃番茄的土拨鼠 | 来源:发表于2021-07-11 21:13 被阅读0次

    话不多说,直接看代码:下面是一个基于词典权重的 DAG(有向无环图)分词示例,用调试器单步跑一遍就可以完全看懂。

    #!/usr/bin/python
    # coding=utf-8
    # DAG-based word segmentation demo (dag 分词)
    
    
    # Segmentation dictionary: maps each known word to its integer weight.
    word_dict = {
        u'我是': 30,
        u'是小帅': 60,
        u'我': 1,
        u'是': 1,
        u'小帅哥': 18,
        u'哥': 1,
    }
    
    
    #
    def get_dag(txt, dictionary=None):
        """Build the candidate-word graph (DAG) of ``txt``.

        For every start index the graph records the inclusive end index of
        each candidate word that begins there.

        Args:
            txt: The text to segment.
            dictionary: Container of known words (anything supporting ``in``).
                Defaults to the module-level ``word_dict``.

        Returns:
            A dict mapping every index of ``txt`` to a list of inclusive end
            indices in increasing order. The first entry is always the start
            index itself; an empty ``txt`` yields an empty dict.
        """
        if dictionary is None:
            dictionary = word_dict
        n = len(txt)
        dag_dict = {}
        for idx in range(n):
            # The single character at idx is always a candidate, whether or
            # not it appears in the dictionary, so every index gets an entry.
            ends = [idx]
            # Longer candidates (two or more characters) must be known words.
            ends.extend(
                end for end in range(idx + 1, n)
                if txt[idx:end + 1] in dictionary
            )
            dag_dict[idx] = ends
        return dag_dict
    
    
    def get_route(dag_dict, txt, dictionary=None):
        """Compute the maximum-weight segmentation route by scanning backwards.

        Works from the end of ``txt`` towards the start so that the best
        route of every suffix is already known when a position is evaluated.

        Args:
            dag_dict: Dict mapping a start index to a list of inclusive end
                indices of the candidate words starting there.
            txt: The text being segmented.
            dictionary: Mapping of word -> weight; words that are missing
                count as weight 0. Defaults to the module-level ``word_dict``.

        Returns:
            A dict mapping each index to ``[best_total_weight, end_index]``,
            where ``end_index`` is the inclusive end of the word chosen at
            that index. The sentinel entry ``len(txt): [0, 0]`` is always
            present.
        """
        if dictionary is None:
            dictionary = word_dict
        n = len(txt)
        route_dict = {n: [0, 0]}
        for idx in range(n - 1, -1, -1):
            if idx not in dag_dict:
                # No graph entry for this position: treat the character as a
                # one-character word with the fixed weight 1.
                route_dict[idx] = [1, idx]
                continue
            # ``None`` means "nothing chosen yet". Starting from 0 with a
            # strict ``>`` would leave no entry when every candidate totals 0
            # (e.g. the text ends in an unknown character), which then raised
            # KeyError on the next lookup of route_dict[x + 1].
            best = None
            for x in dag_dict[idx]:
                total_weight = dictionary.get(txt[idx:x + 1], 0) + route_dict[x + 1][0]
                # Strict comparison keeps the first candidate among equals.
                if best is None or total_weight > best[0]:
                    best = [total_weight, x]
            if best is None:
                # Empty candidate list: fall back to the single character.
                best = [route_dict[idx + 1][0], idx]
            route_dict[idx] = best
        return route_dict
    
    
    def get_words(route_dict, txt):
        """Cut ``txt`` into words by following the chosen route from index 0.

        Args:
            route_dict: Dict mapping a start index to a pair whose second
                item is the inclusive end index of the word starting there.
            txt: The text being segmented.

        Returns:
            The list of words in text order.
        """
        pieces = []
        start, total = 0, len(txt)
        while start < total:
            # Stored end indices are inclusive; convert to an exclusive bound.
            stop = route_dict[start][1] + 1
            pieces.append(txt[start:stop])
            start = stop
        return pieces
    
    
    # Demo: segment a sample sentence in three stages and print the result.
    query = u'我是小帅哥'
    # Stage 1: collect, for every start index, the candidate word end indices.
    dag = get_dag(query)
    # Stage 2: choose the maximum-weight route through those candidates.
    route = get_route(dag, query)
    # Stage 3: walk the route from index 0 to cut the text into words.
    words = get_words(route, query)
    # Words are printed joined by '-'; with the weights in word_dict this
    # yields 我-是小帅-哥.
    print('-'.join(words))
    
    

    相关文章

      网友评论

          本文标题:基于同义词的分词算法

          本文链接:https://www.haomeiwen.com/subject/acklpltx.html