话不多说，debug 一遍就可以完全看懂。
#!/usr/bin/python
# coding=utf-8
# DAG-based word segmentation (分词)
# Toy vocabulary for the segmenter: maps a word to its weight.
# A higher weight makes the word more likely to be chosen on the best route.
word_dict = {
    u'我是': 30,
    u'是小帅': 60,
    u"我": 1,
    u"是": 1,
    u"小帅哥": 18,
    u"哥": 1,
}
#
def get_dag(txt, vocab=None):
    """Build the candidate-word graph (DAG) for ``txt``.

    For every start index ``idx`` the result lists the inclusive end indices
    of the candidate words beginning there. ``idx`` itself is always listed
    first, so a single character is always a candidate even when it is not in
    the vocabulary; a longer span ``txt[idx:end + 1]`` is listed only when it
    is a key of the vocabulary.

    Args:
        txt: The text to segment.
        vocab: Optional container of known words (only membership is
            checked). Defaults to the module-level ``word_dict``.

    Returns:
        A dict mapping each index of ``txt`` to an ascending list of
        inclusive end indices. Empty when ``txt`` is empty.
    """
    if vocab is None:
        # Resolved at call time so existing one-argument callers keep
        # using the module-level dictionary.
        vocab = word_dict
    n = len(txt)
    dag_dict = {}
    for idx in range(n):
        # The single-character span is unconditional; it is the fallback
        # that guarantees every position has at least one outgoing edge.
        ends = [idx]
        ends.extend(
            end for end in range(idx + 1, n) if txt[idx:end + 1] in vocab
        )
        dag_dict[idx] = ends
    return dag_dict
def get_route(dag_dict, txt, vocab=None):
    """Compute the maximum-weight segmentation route over a word DAG.

    Works backwards from the end of ``txt``: for each start index it picks
    the candidate end whose word weight plus the best weight of the remaining
    text is largest.

    Args:
        dag_dict: Maps a start index to a list of inclusive end indices of
            candidate words.
        txt: The text being segmented.
        vocab: Optional mapping from word to weight; words that are not
            present weigh 0. Defaults to the module-level ``word_dict``.

    Returns:
        A dict mapping every index ``0..len(txt)`` to ``[best_weight, end]``,
        where ``end`` is the inclusive end index of the word chosen at that
        position. The entry for ``len(txt)`` is the sentinel ``[0, 0]``.
    """
    if vocab is None:
        # Resolved at call time so existing two-argument callers keep
        # using the module-level dictionary.
        vocab = word_dict
    n = len(txt)
    route_dict = {n: [0, 0]}  # sentinel: nothing left to segment
    for idx in range(n - 1, -1, -1):
        candidates = dag_dict.get(idx)
        if not candidates:
            # No outgoing edge recorded: fall back to a single character.
            route_dict[idx] = [1, idx]
            continue
        best = None
        for x in candidates:
            total_weight = vocab.get(txt[idx:x + 1], 0) + route_dict[x + 1][0]
            # Start from "no best yet" rather than a 0 threshold: a position
            # whose best total is 0 (e.g. an unknown trailing character) must
            # still get a route, otherwise later lookups raise KeyError.
            # Strict ">" keeps the first candidate on ties.
            if best is None or total_weight > best[0]:
                best = [total_weight, x]
        route_dict[idx] = best
    return route_dict
def get_words(route_dict, txt):
    """Cut ``txt`` into words by following the chosen route from index 0.

    Args:
        route_dict: Maps a start index to a pair whose second item is the
            inclusive end index of the word chosen at that position.
        txt: The text being segmented.

    Returns:
        The list of words in order of appearance; empty for empty ``txt``.
    """
    total = len(txt)
    pieces = []
    start = 0
    while start < total:
        # Convert the stored inclusive end into an exclusive slice bound,
        # which is also where the next word begins.
        stop = route_dict[start][1] + 1
        pieces.append(txt[start:stop])
        start = stop
    return pieces
# Demo: segment a sample sentence in three steps (word graph -> best route
# -> word list) and print the words joined by '-'.
query = u'我是小帅哥'
dag = get_dag(txt=query)
route = get_route(dag_dict=dag, txt=query)
words = get_words(route_dict=route, txt=query)
print('-'.join(words))
网友评论