Generating Sentence Vectors with Word2vec (Part 2)


Author: 菜菜鑫 | Published 2018-12-31 04:51

    In the previous article, "利用Word2vec生成句向量(一)" (Generating Sentence Vectors with Word2vec, Part 1), I introduced two ways to build sentence vectors. This article covers a method billed as "simple but tough to beat": SIF weighted averaging.
    The paper is "A simple but tough-to-beat baseline for sentence embeddings".
    As before, I will not spend much time on the paper or its theory; the focus is on modifying the released code so that it actually runs and anyone can use it out of the box.
    Like the TF-IDF weighting discussed earlier, SIF assigns each word vector a weight, and the weight is again a function of word frequency; the paper calls the scheme smooth inverse frequency (SIF).
    The authors released the SIF source code, but it only supports English and Python 2, not Python 3. Below I adapt it to Chinese and Python 3, with brief commentary along the way.
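
    To make the weighting concrete: each word w gets the weight a / (a + p(w)), where p(w) is the word's relative corpus frequency and a is a small smoothing constant. A minimal sketch with hypothetical frequencies (not values from the code below):

    # SIF weight a / (a + p(w)): frequent words get weights near 0, rare words near 1
    a = 1e-3
    p_w = {'的': 5e-2, '测试': 1e-4}             # hypothetical relative frequencies p(w)
    sif_weight = {w: a / (a + p) for w, p in p_w.items()}
    print(sif_weight)                            # roughly {'的': 0.02, '测试': 0.91}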


    First, clone the SIF source code locally:

    git clone https://github.com/PrincetonML/SIF.git
    

    Note that the SIF source code itself has quite a few bugs: the GloVe vector file it reads is a mess, with adjacent values in a vector sometimes run together and unparseable, so getting the original SIF to run as-is takes a fair amount of debugging.


    Main entry point: sif_embedding.py
    Changes:

    • Replace the GloVe vectors in the original code with your own Word2vec vectors, loaded via Word2Vec.load() on your word-vector file.
    • The original code depends on a file called enwiki_vocab_min200.txt whose origin is never explained; inspection shows it simply stores each word's frequency. A gensim-trained Word2vec model already carries equivalent statistics, so no extra file is needed: just pass model_100.wv.vocab in and adjust how getWordWeight accesses it.
    from src import data_io, params, SIF_embedding
    import os
    from gensim.models import Word2Vec
    
    model_100 = Word2Vec.load(os.path.join('/media/brx/TOSHIBA EXT/wiki_zh_word2vec/', 'ngram_100_5_90w.bin'))
    words = {}
    for index, word in enumerate(model_100.wv.index2entity):
        words[word] = index
    We = model_100.wv.vectors
    
    # input
    weightpara = 1e-3 # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1 # number of principal components to remove in SIF weighting scheme
    sentences = ['这是一个测试句子', '这是另一个测试句子']
    
    # load word vectors
    # (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(model_100.wv.vocab, weightpara) # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight) # weight4ind[i] is the weight for the i-th word
    # load sentences
    x, m = data_io.sentences2idx(sentences, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind) # get word weights
    
    # set parameters
    params = params.params()
    params.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(We, x, w, params) # embedding[i,:] is the embedding for sentence i
    print(embedding)  # embedding[i, :] is the SIF sentence vector for sentences[i]
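
    As a quick sanity check once the script above has produced embedding, the two sentence vectors can be compared directly; a minimal sketch (assuming embedding is the 2 x dim array returned by SIF_embedding above):

    from numpy.linalg import norm

    v1, v2 = embedding[0], embedding[1]
    cos_sim = float(v1.dot(v2) / (norm(v1) * norm(v2)))  # cosine similarity
    euc_dist = float(norm(v1 - v2))                       # Euclidean distance
    print(cos_sim, euc_dist)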
    

    Data handling: data_io.py
    Changes:

    • In getSeq, replace the original split() with jieba word segmentation.
    • In sentences2idx, add the definition seq1 = [] (a bug in the original code).
    • getWordWeight implements the smooth inverse frequency; the word's frequency and the total count are now read from the Word2vec model instead of from a file.
    • iteritems -> items, xrange -> range (Python 2 -> Python 3).
    from __future__ import print_function
    
    import numpy as np
    import pickle
    from src.tree import tree
    import jieba
    #from theano import config
    
    def getWordmap(textfile):
        words={}
        We = []
        f = open(textfile,'r', errors='ignore')
        lines = f.readlines()
        for (n,i) in enumerate(lines):
            i=i.split()
            j = 1
            v = []
            while j < len(i):
                if i[j] == '.':
                    v.append(0)
                else:
                    v.append(float(i[j]))
                j += 1
            words[i[0]] = n
            We.append(v)
        return (words, np.array(We))
    
    def prepare_data(list_of_seqs):
        lengths = [len(s) for s in list_of_seqs]
        n_samples = len(list_of_seqs)
        maxlen = np.max(lengths)
        x = np.zeros((n_samples, maxlen)).astype('int32')
        x_mask = np.zeros((n_samples, maxlen)).astype('float32')
        for idx, s in enumerate(list_of_seqs):
            x[idx, :lengths[idx]] = s
            x_mask[idx, :lengths[idx]] = 1.
        x_mask = np.asarray(x_mask, dtype='float32')
        return x, x_mask
    
    def lookupIDX(words,w):
        w = w.lower()
        if len(w) > 1 and w[0] == '#':
            w = w.replace("#","")
        if w in words:
            return words[w]
        elif 'UUUNKKK' in words:
            return words['UUUNKKK']
        else:
            return len(words) - 1
    
    def getSeq(p1,words):
        p1 = jieba.cut(p1)
        X1 = []
        for i in p1:
            X1.append(lookupIDX(words,i))
        return X1
    
    def getSeqs(p1,p2,words):
        p1 = p1.split()
        p2 = p2.split()
        X1 = []
        X2 = []
        for i in p1:
            X1.append(lookupIDX(words,i))
        for i in p2:
            X2.append(lookupIDX(words,i))
        return X1, X2
    
    def get_minibatches_idx(n, minibatch_size, shuffle=False):
        idx_list = np.arange(n, dtype="int32")
    
        if shuffle:
            np.random.shuffle(idx_list)
    
        minibatches = []
        minibatch_start = 0
        for i in range(n // minibatch_size):
            minibatches.append(idx_list[minibatch_start:
            minibatch_start + minibatch_size])
            minibatch_start += minibatch_size
    
        if (minibatch_start != n):
            minibatches.append(idx_list[minibatch_start:])
    
        return zip(range(len(minibatches)), minibatches)
    
    def getSimEntDataset(f,words,task):
        data = open(f,'r')
        lines = data.readlines()
        examples = []
        for i in lines:
            i=i.strip()
            if(len(i) > 0):
                i=i.split('\t')
                if len(i) == 3:
                    if task == "sim":
                        e = (tree(i[0], words), tree(i[1], words), float(i[2]))
                        examples.append(e)
                    elif task == "ent":
                        e = (tree(i[0], words), tree(i[1], words), i[2])
                        examples.append(e)
                    else:
                        raise ValueError('Params.traintype not set correctly.')
    
                else:
                    print(i)
        return examples
    
    def getSentimentDataset(f,words):
        data = open(f,'r')
        lines = data.readlines()
        examples = []
        for i in lines:
            i=i.strip()
            if(len(i) > 0):
                i=i.split('\t')
                if len(i) == 2:
                    e = (tree(i[0], words), i[1])
                    examples.append(e)
                else:
                    print(i)
        return examples
    
    def getDataSim(batch, nout):
        g1 = []
        g2 = []
        for i in batch:
            g1.append(i[0].embeddings)
            g2.append(i[1].embeddings)
    
        g1x, g1mask = prepare_data(g1)
        g2x, g2mask = prepare_data(g2)
    
        scores = []
        if nout <=0:
            return (scores, g1x, g1mask, g2x, g2mask)
    
        for i in batch:
            temp = np.zeros(nout)
            score = float(i[2])
            ceil, fl = int(np.ceil(score)), int(np.floor(score))
            if ceil == fl:
                temp[fl - 1] = 1
            else:
                temp[fl - 1] = ceil - score
                temp[ceil - 1] = score - fl
            scores.append(temp)
        scores = np.matrix(scores) + 0.000001
        scores = np.asarray(scores, dtype='float32')
        return (scores, g1x, g1mask, g2x, g2mask)
    
    def getDataEntailment(batch):
        g1 = []; g2 = []
        for i in batch:
            g1.append(i[0].embeddings)
            g2.append(i[1].embeddings)
    
        g1x, g1mask = prepare_data(g1)
        g2x, g2mask = prepare_data(g2)
    
        scores = []
        for i in batch:
            temp = np.zeros(3)
            label = i[2].strip()
            if label == "CONTRADICTION":
                temp[0]=1
            if label == "NEUTRAL":
                temp[1]=1
            if label == "ENTAILMENT":
                temp[2]=1
            scores.append(temp)
        scores = np.matrix(scores)+0.000001
        scores = np.asarray(scores,dtype='float32')
        return (scores,g1x,g1mask,g2x,g2mask)
    
    def getDataSentiment(batch):
        g1 = []
        for i in batch:
            g1.append(i[0].embeddings)
    
        g1x, g1mask = prepare_data(g1)
    
        scores = []
        for i in batch:
            temp = np.zeros(2)
            label = i[1].strip()
            if label == "0":
                temp[0]=1
            if label == "1":
                temp[1]=1
            scores.append(temp)
        scores = np.matrix(scores)+0.000001
        scores = np.asarray(scores,dtype='float32')
        return (scores,g1x,g1mask)
    
    def sentences2idx(sentences, words):
        """
        Given a list of sentences, output array of word indices that can be fed into the algorithms.
        :param sentences: a list of sentences
        :param words: a dictionary, words['str'] is the indices of the word 'str'
        :return: x1, m1. x1[i, :] is the word indices in sentence i, m1[i,:] is the mask for sentence i (0 means no word at the location)
        """
        seq1 = []
        for i in sentences:
            seq1.append(getSeq(i,words))
        x1,m1 = prepare_data(seq1)
        return x1, m1
    
    
    def sentiment2idx(sentiment_file, words):
        """
        Read sentiment data file, output array of word indices that can be fed into the algorithms.
        :param sentiment_file: file name
        :param words: a dictionary, words['str'] is the indices of the word 'str'
        :return: x1, m1, golds. x1[i, :] is the word indices in sentence i, m1[i,:] is the mask for sentence i (0 means no word at the location), golds[i] is the label (0 or 1) for sentence i.
        """
        f = open(sentiment_file,'r')
        lines = f.readlines()
        golds = []
        seq1 = []
        for i in lines:
            i = i.split("\t")
            p1 = i[0]; score = int(i[1]) # score are labels 0 and 1
            X1 = getSeq(p1,words)
            seq1.append(X1)
            golds.append(score)
        x1,m1 = prepare_data(seq1)
        return x1, m1, golds
    
    def sim2idx(sim_file, words):
        """
        Read similarity data file, output array of word indices that can be fed into the algorithms.
        :param sim_file: file name
        :param words: a dictionary, words['str'] is the indices of the word 'str'
        :return: x1, m1, x2, m2, golds. x1[i, :] is the word indices in the first sentence in pair i, m1[i,:] is the mask for the first sentence in pair i (0 means no word at the location), golds[i] is the score for pair i (float). x2 and m2 are similar to x1 and m2 but for the second sentence in the pair.
        """
        f = open(sim_file,'r')
        lines = f.readlines()
        golds = []
        seq1 = []
        seq2 = []
        for i in lines:
            i = i.split("\t")
            p1 = i[0]; p2 = i[1]; score = float(i[2])
            X1, X2 = getSeqs(p1,p2,words)
            seq1.append(X1)
            seq2.append(X2)
            golds.append(score)
        x1,m1 = prepare_data(seq1)
        x2,m2 = prepare_data(seq2)
        return x1, m1, x2, m2, golds
    
    def entailment2idx(sim_file, words):
        """
        Read similarity data file, output array of word indices that can be fed into the algorithms.
        :param sim_file: file name
        :param words: a dictionary, words['str'] is the indices of the word 'str'
        :return: x1, m1, x2, m2, golds. x1[i, :] is the word indices in the first sentence in pair i, m1[i,:] is the mask for the first sentence in pair i (0 means no word at the location), golds[i] is the label for pair i (CONTRADICTION NEUTRAL ENTAILMENT). x2 and m2 are similar to x1 and m2 but for the second sentence in the pair.
        """
        f = open(sim_file,'r')
        lines = f.readlines()
        golds = []
        seq1 = []
        seq2 = []
        for i in lines:
            i = i.split("\t")
            p1 = i[0]; p2 = i[1]; score = i[2]
            X1, X2 = getSeqs(p1,p2,words)
            seq1.append(X1)
            seq2.append(X2)
            golds.append(score)
        x1,m1 = prepare_data(seq1)
        x2,m2 = prepare_data(seq2)
        return x1, m1, x2, m2, golds
    
    def getWordWeight(word2weight, a=1e-3):
        if a <=0: # when the parameter makes no sense, use unweighted
            a = 1.0
    
        # word2weight = {}
        # with open(weightfile) as f:
        #     lines = f.readlines()
        # N = 0
        # for i in lines:
        #     i=i.strip()
        #     if(len(i) > 0):
        #         i=i.split()
        #         if(len(i) == 2):
        #             word2weight[i[0]] = float(i[1])
        #             N += float(i[1])
        #         else:
        #             print(i)
        # smooth inverse frequency: weight(w) = a / (a + frequency term); here the
        # frequency term is taken from the gensim vocab entry as count / sample_int
        for key, value in word2weight.items():
            word2weight[key] = a / (a + value.count / value.sample_int)
        return word2weight
    
    def getWeight(words, word2weight):
        weight4ind = {}
        for word, ind in words.items():
            if word in word2weight:
                weight4ind[ind] = word2weight[word]
            else:
                weight4ind[ind] = 1.0
        return weight4ind
    
    def seq2weight(seq, mask, weight4ind):
        weight = np.zeros(seq.shape).astype('float32')
        for i in range(seq.shape[0]):
            for j in range(seq.shape[1]):
                if mask[i,j] > 0 and seq[i,j] >= 0:
                    weight[i,j] = weight4ind[seq[i,j]]
        weight = np.asarray(weight, dtype='float32')
        return weight
    
    def getIDFWeight(wordfile, save_file=''):
        def getDataFromFile(f, words):
            f = open(f,'r')
            lines = f.readlines()
            golds = []
            seq1 = []
            seq2 = []
            for i in lines:
                i = i.split("\t")
                p1 = i[0]; p2 = i[1]; score = float(i[2])
                X1, X2 = getSeqs(p1,p2,words)
                seq1.append(X1)
                seq2.append(X2)
                golds.append(score)
            x1,m1 = prepare_data(seq1)
            x2,m2 = prepare_data(seq2)
            return x1,m1,x2,m2
    
        prefix = "../data/"
        farr = ["MSRpar2012"]
        #farr = ["MSRpar2012",
        #        "MSRvid2012",
        #        "OnWN2012",
        #        "SMTeuro2012",
        #        "SMTnews2012", # 4
        #        "FNWN2013",
        #        "OnWN2013",
        #        "SMT2013",
        #        "headline2013", # 8
        #        "OnWN2014",
        #        "deft-forum2014",
        #        "deft-news2014",
        #        "headline2014",
        #        "images2014",
        #        "tweet-news2014", # 14
        #        "answer-forum2015",
        #        "answer-student2015",
        #        "belief2015",
        #        "headline2015",
        #        "images2015",    # 19
        #        "sicktest",
        #        "twitter",
        #        "JHUppdb",
        #        "anno-dev",
        #        "anno-test"]
        (words, We) = getWordmap(wordfile)
        df = np.zeros((len(words),))
        dlen = 0
        for f in farr:
            g1x,g1mask,g2x,g2mask = getDataFromFile(prefix+f, words)
            dlen += g1x.shape[0]
            dlen += g2x.shape[0]
            for i in range(g1x.shape[0]):
                for j in range(g1x.shape[1]):
                    if g1mask[i,j] > 0:
                        df[g1x[i,j]] += 1
            for i in range(g2x.shape[0]):
                for j in range(g2x.shape[1]):
                    if g2mask[i,j] > 0:
                        df[g2x[i,j]] += 1
    
        weight4ind = {}
        for i in range(len(df)):
            weight4ind[i] = np.log2((dlen+2.0)/(1.0+df[i]))
        if save_file:
            pickle.dump(weight4ind, open(save_file, 'wb'))  # binary mode for Python 3 pickle
        return weight4ind
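
    A side note on the weight computation in getWordWeight above: the paper defines the weight as a / (a + p(w)) with p(w) the word's relative corpus frequency, while the code uses the ratio count / sample_int from the gensim vocab entry, which still down-weights frequent words but is not the same quantity as p(w). A more literal rendition, assuming a gensim 3.x model whose wv.vocab entries expose a raw .count attribute, might look like this sketch:

    def get_word_weight_from_counts(vocab, a=1e-3):
        # weight(w) = a / (a + count(w) / N), with N the total token count over the vocabulary
        total = float(sum(v.count for v in vocab.values()))
        return {w: a / (a + v.count / total) for w, v in vocab.items()}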
    

    Removing the correction term: SIF_embedding.py
    The component to remove is obtained with truncated SVD (singular value decomposition), which is closely related to PCA and is commonly used for dimensionality reduction.
    svd.components_ is a matrix whose i-th row is the i-th principal direction (right singular vector) of the input matrix.
    remove_pc then subtracts the projection onto svd.components_; in the paper's words, it removes (subtracts) each sentence vector's projection onto the first principal component (principal component / singular vector) of the matrix formed by all the sentence vectors.

    Changes:

    • In get_weighted_average, the construction of the weighted average vectors is rewritten in Python 3 syntax. Testing shows the same sentences produce identical results under the original code on Python 2 and the modified code on Python 3.
    import numpy as np
    from sklearn.decomposition import TruncatedSVD
    
    
    def get_weighted_average(We, x, w):
        """
        Compute the weighted average vectors
        :param We: We[i,:] is the vector for word i
        :param x: x[i, :] are the indices of the words in sentence i
        :param w: w[i, :] are the weights for the words in sentence i
        :return: emb[i, :] are the weighted average vector for sentence i
        """
        n_samples = x.shape[0]
        emb = np.zeros((n_samples, We.shape[1]))
        for i in range(n_samples):
            # emb[i] = w[i].dot(np.array(We[x[i]])) / np.count_nonzero(w[i])
            for j in range(len(w[i])):
                emb[i] += w[i][j] * np.array(We[x[i]][j])
            emb[i] = emb[i] / np.count_nonzero(w[i])
        return emb
    
    def compute_pc(X,npc=1):
        """
        Compute the principal components. DO NOT MAKE THE DATA ZERO MEAN!
        :param X: X[i,:] is a data point
        :param npc: number of principal components to remove
        :return: component_[i,:] is the i-th pc
        """
        svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
        svd.fit(X)
        return svd.components_
    
    def remove_pc(X, npc=1):
        """
        Remove the projection on the principal components
        :param X: X[i,:] is a data point
        :param npc: number of principal components to remove
        :return: XX[i, :] is the data point after removing its projection
        """
        pc = compute_pc(X, npc)
        if npc==1:
            XX = X - X.dot(pc.transpose()) * pc
        else:
            XX = X - X.dot(pc.transpose()).dot(pc)
        return XX
    
    
    def SIF_embedding(We, x, w, params):
        """
        Compute the scores between pairs of sentences using weighted average + removing the projection on the first principal component
        :param We: We[i,:] is the vector for word i
        :param x: x[i, :] are the indices of the words in the i-th sentence
        :param w: w[i, :] are the weights for the words in the i-th sentence
        :param params.rmpc: if >0, remove the projections of the sentence embeddings to their first principal component
        :return: emb, emb[i, :] is the embedding for sentence i
        """
        emb = get_weighted_average(We, x, w)
        if  params.rmpc > 0:
            emb = remove_pc(emb, params.rmpc)
        return emb
    

    The paper's experiments show that the method is quite competitive: on most datasets it beats plain word-vector averaging and TF-IDF weighted averaging, and with PSL word vectors it can even achieve the best results. On sentence-similarity tasks it is above average and beats some far more complex models; on sentence classification the effect is also clear, in some cases the best reported score.


    Lessons learned

    • I compared plain word-vector averaging and TF-IDF weighted averaging against SIF, all using 100-dimensional Word2vec vectors, on a similarity-matching task, measuring distance with both Euclidean distance and cosine similarity; SIF actually performed worse than the two simpler methods.
    • Also, for two very similar sentences, the SIF vectors come out with opposite signs element-wise. For the two nearly identical Chinese sentences '这是一个测试句子' and '这是另一个测试句子', every component of one embedding has the opposite sign of the corresponding component in the other, which puzzled me at first (see the sketch after the output below). This is bad for Euclidean-distance matching, since these two sentences should not end up far apart, so with SIF it is better to match with cosine similarity.
    这是一个测试句子
    [ 0.07638578 -0.15427788  0.04004123 -0.11843429 -0.06013182  0.03942103
     -0.01382917  0.01305546  0.06177262 -0.02547832 -0.04165836  0.02171577
      0.03483471  0.05667425 -0.117093   -0.02521048 -0.00686271 -0.02931183
      0.05059035 -0.02502487 -0.00903647  0.00778577  0.01954736 -0.03124137
      0.10074088  0.02835767 -0.08591071 -0.05027893  0.09560275 -0.08829507
     -0.07332305 -0.06830808  0.09723447  0.01102427 -0.10592448 -0.01029612
      0.07102155 -0.03058108 -0.01676355 -0.06929373 -0.05900271 -0.05584531
     -0.00446632  0.07027014  0.14057033 -0.05284498 -0.02534611 -0.01722914
     -0.07428796 -0.05775267 -0.00475082  0.00043147 -0.0978087   0.08172205
     -0.10074747 -0.03555521 -0.08807748  0.07520326  0.01554954 -0.00893718
      0.07821482  0.00935646  0.0465772   0.00160614 -0.05490717 -0.01119706
     -0.04844879 -0.06298091  0.01656367  0.00719948  0.12924895 -0.00991099
      0.08364741 -0.00887778 -0.05152184  0.10083027  0.0076994   0.03921235
      0.00199744  0.0446614  -0.06055355  0.12712339  0...]
    这是另一个测试句子
    [-0.08652813  0.1747626  -0.04535782  0.13415977  0.06811601 -0.04465528
      0.01566538 -0.01478894 -0.06997467  0.02886128  0.04718968 -0.02459915
     -0.03946    -0.06419934  0.13264038  0.02855788  0.00777393  0.0332038
     -0.05730764  0.02834763  0.01023632 -0.00881955 -0.02214282  0.03538954
     -0.11411705 -0.03212295  0.09731777  0.05695486 -0.10829669  0.10001872
      0.08305874  0.07737789 -0.11014507 -0.01248805  0.11998893  0.01166321
     -0.08045165  0.03464158  0.01898938  0.07849441  0.06683698  0.06326034
      0.00505935 -0.07960047 -0.15923499  0.05986164  0.02871152  0.01951679
      0.08415177  0.06542096  0.00538162 -0.00048876  0.11079555 -0.09257294
      0.11412452  0.04027616  0.09977224 -0.08518861 -0.01761418  0.01012385
     -0.08860003 -0.0105988  -0.05276163 -0.0018194   0.06219764  0.01268379
      0.05488173  0.0713434  -0.01876297 -0.00815541 -0.14641037  0.01122695
     -0.09475394  0.01005656  0.05836281 -0.11421831 -0.00872171 -0.04441888
     -0.00226266 -0.05059145  0.06859373 -0.14400259 -0....]
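
    The sign flip is in fact what the linear algebra predicts when rmpc = 1 is applied to only two sentences: after removing the first principal component, the residual matrix has rank at most one, so its two rows must be parallel or anti-parallel, and for near-identical sentences they come out as near-exact negatives. A minimal sketch with toy vectors (not the embeddings printed above):

    import numpy as np
    from sklearn.decomposition import TruncatedSVD

    # two almost identical "sentence vectors" (toy data)
    X = np.array([[1.00, 0.50, -0.20],
                  [0.98, 0.52, -0.18]])

    svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
    svd.fit(X)
    pc = svd.components_                 # first principal direction, shape (1, 3)
    XX = X - X.dot(pc.T) * pc            # same removal as remove_pc with npc=1
    print(XX[0], XX[1])                  # the two residual rows are near-exact negatives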
    

    I may upload the code to GitHub later.
    (Original content; please credit the source when reposting.)
