美文网首页
WMD原始定义的代码实现

WMD原始定义的代码实现

作者: 0_oHuanyu | 来源:发表于2020-03-25 19:17 被阅读0次
glovefile = open("glove.6B.100d.txt","r",encoding="utf-8")  
from cvxopt import matrix, solvers
import numpy as np

# 加载训练好的词向量
def load_embedding():
    word_emds = {}
    for line in glovefile:
        eles = line.split(" ")
        vec = list(map(float, eles[1:]))
        word_emds[eles[0]] = vec
    return word_emds


# 计算两个向量的欧式距离
def get_word_embedding_distance(emb1, emb2):
    if (len(emb1) != len(emb2)):
        print('error input,x and y is not in the same space')
        return
    result1 = 0.0
    for i in range(len(emb1)):
        result1 += (emb1[i] - emb2[i]) * (emb1[i] - emb2[i])
    distance = result1 ** 0.5
    return distance

# 把list转成map,key为list中的元素,value为在原句中出现的次数
def word_count(words):
    word_map = {}
    for word in words:
        if word in word_map.keys():
            word_map[word] += 1.0
        else:
            word_map[word] = 1.0
    return word_map

# TODO: 编写WMD函数来计算两个句子之间的相似度

def WMD (sent1, sent2):
    """
    这是主要的函数模块。参数sent1是第一个句子, 参数sent2是第二个句子,可以认为没有经过分词。
    在英文里,用空格作为分词符号。
    
    在实现WMD算法的时候,需要用到LP Solver用来解决Transportation proboem. 请使用http://cvxopt.org/examples/tutorial/lp.html
    也可以参考blog: https://scaron.info/blog/linear-programming-in-python-with-cvxopt.html
    
    需要做的事情为:
    
    1. 对句子做分词: 调用 .split() 函数即可
    2. 获取每个单词的词向量。这需要读取文件之后构建embedding matrix. 
    3. 构建lp问题,并用solver解决
    
    可以自行定义其他的函数,但务必不要改写WMD函数名。测试时保证WMD函数能够正确运行。
    """
    
    # 分词
    words1 = sent1.split()
    words2 = sent2.split()

    # 去重、统计词频
    word_map1 = word_count(words1)
    word_map2 = word_count(words2)

    # 将分词转为词向量
    word_embs1 = [word_emds[word] for word in word_map1.keys()]
    word_embs2 = [word_emds[word] for word in word_map2.keys()]

    # 准备遍历
    len1 = len(word_embs1)
    len2 = len(word_embs2)
    # 获得两个句子之间的词汇距离ci,j,存储在二维数组cc中
    c_ij = []
    for i in range(len1):
        for j in range(len2):
            c_ij.append(get_word_embedding_distance(word_embs1[i], word_embs2[j]))

    # 设计A矩阵
    a = []
    #   句子1对应到句子2部分
    for i in range(len1):
        line_ele = []
        for ii in range(i * len2):
            line_ele.append(0.0)
        for j in range(len2):
            line_ele.append(1.0)
        for ii in range((len1 - i - 1) * len2):
            line_ele.append(0.0)
        a.append(line_ele)

    #   句子2对应到句子1
    for i in range(len2):
        line_ele = []
        for ii in range(len1):
            for jj in range(i):
                line_ele.append(0.0)
            line_ele.append(1.0)
            for jj in range(0, len2 - i - 1):
                line_ele.append(0.0)
        a.append(line_ele)

    # 获得出入量之和 这部分逻辑是跟A的设计连在一起的
    b = [ele / len(words1) for ele in list(word_map1.values())] + \
        [ele / len(words2) for ele in list(word_map2.values())]

    # 列出线性规划问题
    A_matrix = matrix(a).trans()
    b_matrix = matrix(b)
    c_matrix = matrix(c_ij)
    num_of_T = len(c_ij)
    G = matrix(-np.eye(num_of_T))
    h = matrix(np.zeros(num_of_T))

#     求解器求解,注意这里必须指定solver,否则会报错,蛋疼
    sol = solvers.lp(c_matrix, G, h, A=A_matrix, b=b_matrix, solver='glpk')
    return sol['primal objective']


# 读取glove的预训练词向量
word_emds = load_embedding()
print("读取完成")

sent1 = "boy like eat apple"
sent2 = "he want some fruit"
sent3 = "a dog dive into water"
print(WMD(sent1, sent2))
print(WMD(sent1, sent3))

相关文章

网友评论

      本文标题:WMD原始定义的代码实现

      本文链接:https://www.haomeiwen.com/subject/vwwxuhtx.html