import jieba
# Two sample Chinese sentences to be segmented and vectorized. Roughly:
# "this boot's size is too big, that one's size fits" and
# "this boot's size is not small, that one fits better".
s1, s2 = (
    '这只皮靴号码大了,那只号码合适',
    '这只皮靴号码不小,那只更合适',
)
def s_seg(s):
    """Segment a sentence with jieba and return it in three forms.

    Args:
        s: The sentence (string) to segment.

    Returns:
        A 3-tuple ``(joined, tokens, unique)`` where ``joined`` is the
        non-empty tokens joined with ``'/'``, ``tokens`` is the list of
        non-empty tokens in the order jieba yields them, and ``unique`` is
        the set of those tokens.
    """
    # Tokenize once and reuse the result for all three return values.
    # cut_all=True selects jieba's "full mode"; empty-string tokens are
    # dropped (presumably emitted around punctuation by some jieba
    # versions -- confirm against the installed release).
    tokens = [x for x in jieba.cut(s, cut_all=True) if x != '']
    return '/'.join(tokens), tokens, set(tokens)
# Segment both sample sentences into (joined string, token list, token set).
s1_seg, s1_lst, s1_set = s_seg(s1)
s2_seg, s2_lst, s2_set = s_seg(s2)

# Vocabulary: every distinct token that occurs in either sentence.
s_dict = s1_set.union(s2_set)

# Vocabulary index: map each token to a consecutive integer id.
# NOTE: ids follow set iteration order, which for strings can differ
# between interpreter runs unless hash randomization is disabled.
word_dict = {}
for word in s_dict:
    # The current size of the mapping is the next unused id.
    word_dict[word] = len(word_dict)
# `i` ends up holding the vocabulary size.
i = len(word_dict)
print(word_dict)
def word_2_vec(word_dict, s_list):
    """Build a bag-of-words count vector for a token list.

    Args:
        word_dict: Mapping from token to its integer position in the
            vector; the vector length is ``len(word_dict)``.
        s_list: Iterable of tokens to count.

    Returns:
        A list of ints in which position ``word_dict[w]`` holds the number
        of times ``w`` occurs in ``s_list`` (0 for tokens that never occur).

    Raises:
        KeyError: If a token in ``s_list`` is not a key of ``word_dict``.
    """
    s_vector = [0] * len(word_dict)
    # Accumulate counts directly at each token's vector position.
    for word in s_list:
        s_vector[word_dict[word]] += 1
    # The vector is still printed, so script output is unchanged.
    print(s_vector)
    # Return the vector so callers receive it instead of None.
    return s_vector
# Vectorize both token lists against the shared vocabulary index
# (first sentence first, then the second).
s1_vec, s2_vec = [word_2_vec(word_dict, tokens) for tokens in (s1_lst, s2_lst)]
# User comments (web-page residue captured along with the snippet; not part of the script)