# Word2Vec / Doc2Vec embeddings for split address strings
from gensim.models import Word2Vec
def address_split(x):
    """Split a row's ``format_address`` value into a list of tokens.

    The value is cut on ``'|'`` and every ``','`` inside a piece is replaced
    with ``'#'``, so ``'A,2|B,4'`` becomes ``['A#2', 'B#4']``.

    Args:
        x: Mapping-like row (a dict or a pandas Series) that has a
            ``'format_address'`` entry.

    Returns:
        The list of tokens. An empty list is returned when the entry is not
        a string (for example ``None`` or the float NaN pandas uses for an
        empty CSV cell), instead of failing on ``.split``.
    """
    format_address = x['format_address']
    # Missing cells are not strings and have no .split(); treat them as an
    # address with no tokens rather than raising AttributeError.
    if not isinstance(format_address, str):
        return []
    return [part.replace(',', '#') for part in format_address.split('|')]
# --- Word2Vec: learn one embedding per address token ------------------------
# NOTE(review): `pd`, `path` and `model_name` are not defined in this excerpt;
# they are assumed to be set up earlier in the file -- confirm.
df = pd.read_csv(path.joinpath('hn_1123_10w.csv'))
# Every row becomes one training "sentence": the token list from address_split.
df['model_input'] = df.apply(address_split, axis=1)

model = Word2Vec(
    df['model_input'].tolist(),
    vector_size=128,
    window=5,
    min_count=1,  # do not drop any token for being too rare
    workers=4,
)

# Save and reload through a single path. The original saved under
# '/Users/xx/' but loaded from '/Users/shilulu/', so the reload did not read
# the model that had just been written.
model_path = '/Users/xx/' + model_name
model.save(model_path)
model = Word2Vec.load(model_path)

# Quick sanity checks on the reloaded model.
print(model.vector_size)
print(model.wv.similarity('北京市#2', '朝阳区#4'))
print(model.wv.most_similar('上海市#2'))
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
# --- Doc2Vec: embed whole addresses and look up similar ones ----------------
words_list = df['model_input'].to_list()
# Tag each address with its position in words_list so a match can be mapped
# back to the tokens it came from.
tagged_doc = [
    TaggedDocument(words=tokens, tags=[idx])
    for idx, tokens in enumerate(words_list)
]
print(tagged_doc[:2])

model = Doc2Vec(vector_size=20, min_count=2, epochs=40)
model.build_vocab(tagged_doc)
model.train(tagged_doc, total_examples=model.corpus_count, epochs=model.epochs)

n = 1
print(words_list[n])
# Infer a vector for address n and fetch its three closest training documents.
inferred = model.infer_vector(words_list[n])
ss = model.dv.most_similar([inferred], topn=3)
# NOTE(review): indentation was lost in the source; both prints are assumed
# to belong under the similarity threshold check -- confirm.
for s in ss:
    tag, score = s
    if not score > 0.8:
        continue
    print(s)
    print(words_list[tag])
# 网友评论 ("reader comments") -- web-page footer text, not part of the script