When the corpus is too large to load into memory at once, you can stream it with an iterator instead of building a list, as in the following example:
from gensim.models import Word2Vec

class MySentences(object):
    """Stream sentences from a file one line at a time to keep memory usage low."""
    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        for line in open(self.filename, encoding='utf8', errors='ignore'):
            yield line.split()


def train():
    sentences = MySentences('/media/jlan/E/Projects/nlp/数据集/thu_rice.txt')  # a memory-friendly iterator
    model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
    model.save(GEN_MODEL_FILE)  # GEN_MODEL_FILE: path for the saved model (defined elsewhere)
    return model
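After training, the saved model can be reloaded and queried. Below is a minimal sketch, assuming the model was saved to the same GEN_MODEL_FILE used by train() above; the query word is only an illustration, not taken from the original post:

from gensim.models import Word2Vec

# Reload the model written by train() above (GEN_MODEL_FILE is an assumption here).
model = Word2Vec.load(GEN_MODEL_FILE)

# Look up the learned vector for a word and its nearest neighbours.
# '中国' is a placeholder; use any token that appears at least min_count times in the corpus.
vector = model.wv['中国']
print(model.wv.most_similar('中国', topn=10))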
Incremental training
To continue training an existing model on a new corpus, load the saved model, add the new words with build_vocab(update=True), and then call train() again:
import os
import sys
import logging
import multiprocessing

import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# Usage examples:
# python train_word2vec_model.py wiki.zh.text.jian.seg.utf-8 wiki.zh.text.model wiki.zh.text.vector
# python train_word2vec_model_update.py new_corpus.jieba meili.txt.model meili.txt.update.model meili.txt.update.vector

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 5:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp_file, inp_model, outp_model, outp_vector = sys.argv[1:5]

    # load the previously trained model
    model = gensim.models.Word2Vec.load(inp_model)
    # add the vocabulary of the new corpus to the existing model
    model.build_vocab(LineSentence(inp_file), update=True)
    # continue training on the new corpus
    model.train(LineSentence(inp_file), total_examples=model.corpus_count, epochs=model.iter)
    # To train from scratch instead, build a new model directly:
    # model = Word2Vec(LineSentence(inp_file), size=400, window=5, min_count=5,
    #                  workers=multiprocessing.cpu_count())

    # trim unneeded model memory = use (much) less RAM
    # model.init_sims(replace=True)
    model.save(outp_model)
    model.wv.save_word2vec_format(outp_vector, binary=False)
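To check that the update took effect, the exported text-format vectors can be reloaded with KeyedVectors. A minimal sketch, assuming the output file name from the example command line above and an older (pre-4.0) gensim API, with a placeholder query word:

from gensim.models import KeyedVectors

# Load the plain-text vectors written by save_word2vec_format(binary=False).
# 'meili.txt.update.vector' follows the example command above; adjust to your own path.
wv = KeyedVectors.load_word2vec_format('meili.txt.update.vector', binary=False)

print(len(wv.vocab))                     # vocabulary size after the incremental update (gensim < 4.0)
print(wv.most_similar('美丽', topn=5))   # query a word from the new corpus (placeholder)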