美文网首页
[python]加载词向量以及用cache的方式加载

[python]加载词向量以及用cache的方式加载

作者: VanJordan | 来源:发表于2019-06-07 15:49 被阅读0次
  • split跳过头部, numpy.asarray(values[1:], dtype='float32')读取词向量。
# NOTE(review): excerpt of a larger loader — `file`, `dim`, `header`,
# `idx2word`, `word2idx`, `embeddings`, and `numpy` must all be bound by the
# enclosing code (not visible here). In particular `header` must be
# pre-initialized to False, or the `index = ...` line raises NameError when
# the file has no header row — confirm against the full function.
with open(file, "r", encoding="utf-8") as f:
    for i, line in enumerate(f, 1):

        # skip the first row if it is a header
        if i == 1:
            if len(line.split()) < dim:
                header = True
                continue

        # First token is the word, the rest are the vector components.
        # (The last component still carries the trailing newline; numpy's
        # float parsing tolerates it.)
        values = line.split(" ")
        word = values[0]
        vector = numpy.asarray(values[1:], dtype='float32')

        # Map the first data row to index 1 in either case (with a header the
        # first data row is i == 2); index 0 is presumably reserved for
        # padding — confirm in the enclosing function.
        index = i - 1 if header else i

        idx2word[index] = word
        word2idx[word] = index
        embeddings.append(vector)

    # add an unk token, for OOV words
    if "<unk>" not in word2idx:
        # Ids are 1-based (0 reserved), so the next free id is count + 1.
        idx2word[len(idx2word) + 1] = "<unk>"
        word2idx["<unk>"] = len(word2idx) + 1
        embeddings.append(
            numpy.random.uniform(low=-0.05, high=0.05, size=dim))
  • 因为第一次加载得到映射关系时加载了所有的词表,接下来应该存储一个cache,加快后面加载的速度,存储成pickle的形式
def write_cache_word_vectors(file, data):
    """Serialize *data* to the pickle cache file derived from *file*.

    The cache path is produced by ``file_cache_name(file)`` (defined
    elsewhere in this module).
    """
    cache_path = file_cache_name(file)
    with open(cache_path, 'wb') as out:
        pickle.dump(data, out)


def load_cache_word_vectors(file):
    """Deserialize and return the cached word vectors for *file*.

    Raises OSError (e.g. FileNotFoundError) if the cache file produced by
    ``file_cache_name(file)`` does not exist. NOTE: ``pickle.load`` must only
    be used on trusted cache files.
    """
    cache_path = file_cache_name(file)
    with open(cache_path, 'rb') as cached:
        data = pickle.load(cached)
    return data
"""  分割线  """
    # NOTE(review): this is the body of a loader function whose `def` line is
    # not visible in this excerpt; `file` and `dim` are presumably its
    # parameters — confirm. Returns (word2idx, idx2word, embeddings).
    try:
        # Fast path: reuse the pickled (word2idx, idx2word, embeddings) tuple.
        cache = load_cache_word_vectors(file)
        print("Loaded word embeddings from cache.")
        return cache
    except OSError:
        # Cache miss (FileNotFoundError is a subclass of OSError) — fall
        # through and index the raw embeddings file instead.
        print("Didn't find embeddings cache file {}".format(file))

    # create the necessary dictionaries and the word embeddings matrix
    # NOTE(review): if `file` does not exist either, the visible code falls
    # off the end and implicitly returns None — callers must handle that.
    if os.path.exists(file):
        print('Indexing file {} ...'.format(file))

        word2idx = {}  # dictionary of words to ids
        idx2word = {}  # dictionary of ids to words
        embeddings = []  # the word embeddings matrix

        # create the 2D array, which will be used for initializing
        # the Embedding layer of a NN.
        # We reserve the first row (idx=0), as the word embedding,
        # which will be used for zero padding (word with id = 0).
        embeddings.append(numpy.zeros(dim))

        # flag indicating whether the first row of the embeddings file
        # has a header
        header = False

        # read file, line by line
        with open(file, "r", encoding="utf-8") as f:
            for i, line in enumerate(f, 1):

                # skip the first row if it is a header
                # (heuristic: a header line has fewer than `dim` tokens)
                if i == 1:
                    if len(line.split()) < dim:
                        header = True
                        continue

                # First token is the word; the remaining `dim` tokens are the
                # vector components (trailing newline is tolerated by numpy).
                values = line.split(" ")
                word = values[0]
                vector = numpy.asarray(values[1:], dtype='float32')

                # First data row maps to id 1 in either case (with a header
                # the first data row is i == 2); id 0 stays reserved for the
                # zero-padding row appended above.
                index = i - 1 if header else i

                idx2word[index] = word
                word2idx[word] = index
                embeddings.append(vector)

            # add an unk token, for OOV words
            if "<unk>" not in word2idx:
                # Ids are 1-based (0 reserved), so next free id = count + 1.
                idx2word[len(idx2word) + 1] = "<unk>"
                word2idx["<unk>"] = len(word2idx) + 1
                embeddings.append(
                    numpy.random.uniform(low=-0.05, high=0.05, size=dim))

            # Debug check: the set of row lengths should be a single value;
            # more than one value means some line was malformed and the
            # numpy.array call below would produce a ragged object array.
            print(set([len(x) for x in embeddings]))

            print('Found %s word vectors.' % len(embeddings))
            embeddings = numpy.array(embeddings, dtype='float32')

        # write the data to a cache file
        write_cache_word_vectors(file, (word2idx, idx2word, embeddings))

        return word2idx, idx2word, embeddings

相关文章

网友评论

      本文标题:[python]加载词向量以及用cache的方式加载

      本文链接:https://www.haomeiwen.com/subject/vfamxctx.html