- Use split to tokenize each line, skip the header row if one is present, and read each word vector with numpy.asarray(values[1:], dtype='float32').
with open(file, "r", encoding="utf-8") as f:
    for i, line in enumerate(f, 1):
        # skip the first row if it is a header
        if i == 1:
            if len(line.split()) < dim:
                header = True
                continue

        values = line.split(" ")
        word = values[0]
        vector = numpy.asarray(values[1:], dtype='float32')

        # shift the indices by one if a header line was skipped
        index = i - 1 if header else i

        idx2word[index] = word
        word2idx[word] = index
        embeddings.append(vector)

# add an unk token, for OOV words
if "<unk>" not in word2idx:
    idx2word[len(idx2word) + 1] = "<unk>"
    word2idx["<unk>"] = len(word2idx) + 1
    embeddings.append(
        numpy.random.uniform(low=-0.05, high=0.05, size=dim))
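The check len(line.split()) < dim works because word2vec-format text files begin with a "vocab_size dim" header line of only two tokens, whereas a real embedding line has dim + 1 tokens (the word plus its vector). A minimal sketch of this heuristic (the sample lines below are made up):

# why the header heuristic works (sample lines are hypothetical):
dim = 100
header_line = "400000 100"                        # word2vec-style header: 2 tokens
vector_line = "apple " + " ".join(["0.1"] * dim)  # real line: dim + 1 tokens
print(len(header_line.split()) < dim)  # True  -> skipped as a header
print(len(vector_line.split()) < dim)  # False -> parsed as a word vector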
- Because the first load builds the mappings by reading the entire vocabulary, we should next store a cache to speed up subsequent loads, saved in pickle format.
def write_cache_word_vectors(file, data):
    with open(file_cache_name(file), 'wb') as pickle_file:
        pickle.dump(data, pickle_file)

def load_cache_word_vectors(file):
    with open(file_cache_name(file), 'rb') as f:
        return pickle.load(f)
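Both helpers rely on file_cache_name, whose implementation is not shown here; a plausible sketch (the .p suffix and placement next to the embeddings file are assumptions) derives the cache path from the embeddings file name:

import os

def file_cache_name(file):
    # assumed helper: put a pickle named like the embeddings file next to it,
    # e.g. glove.6B.50d.txt -> glove.6B.50d.p
    head, tail = os.path.split(file)
    filename, ext = os.path.splitext(tail)
    return os.path.join(head, filename + ".p")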
""" 分割线 """
import os
import pickle

import numpy


def load_word_vectors(file, dim):
    try:
        cache = load_cache_word_vectors(file)
        print("Loaded word embeddings from cache.")
        return cache
    except OSError:
        print("Didn't find embeddings cache file {}".format(file))

    # create the necessary dictionaries and the word embeddings matrix
    if os.path.exists(file):
        print('Indexing file {} ...'.format(file))

        word2idx = {}    # dictionary of words to ids
        idx2word = {}    # dictionary of ids to words
        embeddings = []  # the word embeddings matrix

        # create the 2D array, which will be used for initializing
        # the Embedding layer of a NN.
        # We reserve the first row (idx=0), as the word embedding,
        # which will be used for zero padding (word with id = 0).
        embeddings.append(numpy.zeros(dim))

        # flag indicating whether the first row of the embeddings file
        # has a header
        header = False

        # read file, line by line
        with open(file, "r", encoding="utf-8") as f:
            for i, line in enumerate(f, 1):
                # skip the first row if it is a header
                if i == 1:
                    if len(line.split()) < dim:
                        header = True
                        continue

                values = line.split(" ")
                word = values[0]
                vector = numpy.asarray(values[1:], dtype='float32')

                # shift the indices by one if a header line was skipped
                index = i - 1 if header else i

                idx2word[index] = word
                word2idx[word] = index
                embeddings.append(vector)

        # add an unk token, for OOV words
        if "<unk>" not in word2idx:
            idx2word[len(idx2word) + 1] = "<unk>"
            word2idx["<unk>"] = len(word2idx) + 1
            embeddings.append(
                numpy.random.uniform(low=-0.05, high=0.05, size=dim))

        # sanity check: every vector should have the same dimensionality
        print(set([len(x) for x in embeddings]))
        print('Found %s word vectors.' % len(embeddings))
        embeddings = numpy.array(embeddings, dtype='float32')

        # write the data to a cache file
        write_cache_word_vectors(file, (word2idx, idx2word, embeddings))

        return word2idx, idx2word, embeddings

    # fail loudly if the embeddings file is missing
    print("{} not found!".format(file))
    raise OSError("Could not load file {}".format(file))
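A quick usage sketch: load the vectors and use the resulting matrix to initialize the Embedding layer mentioned in the comments above. The file path is hypothetical and PyTorch is assumed, since the function itself is framework-agnostic:

import torch
import torch.nn as nn

# hypothetical GloVe file; any "word v1 ... v_dim" text file works
word2idx, idx2word, embeddings = load_word_vectors("glove.6B.50d.txt", 50)
print(embeddings.shape)  # (num_words + 2, 50): row 0 is padding, last row is <unk>

# padding_idx=0 keeps row 0 as the all-zero padding vector during training
emb_layer = nn.Embedding.from_pretrained(torch.from_numpy(embeddings),
                                         freeze=False, padding_idx=0)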