word2vec

作者: 进击的小恶魔 | 来源:发表于2019-03-09 00:45 被阅读0次
    import jieba
    
    # Two sample sentences; each is segmented by s_seg and then vectorized.
    s1 = '这只皮靴号码大了,那只号码合适'
    s2 = '这只皮靴号码不小,那只更合适'
    
    def s_seg(s):
        """Segment a sentence with jieba in full mode.

        Args:
            s: The sentence to segment.

        Returns:
            A 3-tuple ``(joined, tokens, unique)``: the non-empty tokens
            joined with ``'/'``, the list of non-empty tokens in segmentation
            order, and the set of distinct tokens.
        """
        # Segment once and derive every result from the same token list;
        # empty-string tokens are dropped.
        tokens = [x for x in jieba.cut(s, cut_all=True) if x != '']
        joined = '/'.join(tokens)
        return joined, tokens, set(tokens)
    
    # Segment both sentences: joined string, token list and unique-token set.
    s1_seg, s1_lst, s1_set = s_seg(s1)
    s2_seg, s2_lst, s2_set = s_seg(s2)

    # Vocabulary: every distinct token seen in either sentence.
    s_dict = s1_set.union(s2_set)

    # Vocabulary index: token -> position in the count vector.
    # Sorting pins the order; iterating the raw set of strings can yield a
    # different index on every run because str hashing is randomized per
    # process, which would make the resulting vectors non-reproducible.
    word_dict = {word: idx for idx, word in enumerate(sorted(s_dict))}
    print(word_dict)
    
    def word_2_vec(word_dict, s_list):
        """Build a term-count vector for a token list.

        Args:
            word_dict: Mapping from token to its index in the output vector.
            s_list: Tokens of one sentence; repeated tokens are counted.

        Returns:
            A list of length ``len(word_dict)`` whose entry at
            ``word_dict[token]`` is the number of times ``token`` occurs in
            ``s_list``; all other entries are 0. The vector is also printed.

        Raises:
            KeyError: If ``s_list`` contains a token missing from
                ``word_dict``.
        """
        # Word count.
        word_cnt = dict()
        for word in s_list:
            word_cnt[word] = word_cnt.get(word, 0) + 1

        # Scatter the counts into their vocabulary positions.
        s_vector = [0] * len(word_dict)
        for word, freq in word_cnt.items():
            s_vector[word_dict[word]] = freq
        print(s_vector)
        # Return the vector so callers that assign the result receive it
        # instead of None.
        return s_vector
    
    # Run word_2_vec on each sentence's token list against the shared word index.
    s1_vec = word_2_vec(word_dict, s1_lst)
    s2_vec = word_2_vec(word_dict, s2_lst)
    

    相关文章

      网友评论

          本文标题:word2vec

          本文链接:https://www.haomeiwen.com/subject/kiyjpqtx.html