gensim

作者: hehehehe | 来源:发表于2021-12-29 14:47 被阅读0次

    # word2vec
    from gensim.models import Word2Vec

    def address_split(x):
        """Turn one row's ``format_address`` string into a list of tokens.

        The value stored under ``x['format_address']`` is cut on ``'|'`` and
        every ``','`` inside the pieces is rewritten as ``'#'``.

        Args:
            x: Mapping-like row (e.g. a DataFrame row) exposing a string under
                the ``'format_address'`` key.

        Returns:
            The list of tokens, in their original order.
        """
        # Swapping ',' for '#' before splitting is safe: neither character is
        # the '|' delimiter, so the split points are unaffected.
        return x['format_address'].replace(',', '#').split('|')
    
    # Train a Word2Vec model on the tokenised addresses, persist it, reload it
    # and run a few sanity queries against the learned vectors.
    # NOTE(review): `pd`, `path` and `model_name` are not defined in this
    # snippet; presumably pandas, a pathlib.Path and a file name set up
    # earlier -- confirm before running.
    df = pd.read_csv(path.joinpath('hn_1123_10w.csv'))
    # One token list per row; the function is passed directly, no lambda needed.
    df['model_input'] = df.apply(address_split, axis=1)
    model = Word2Vec(df['model_input'].tolist(), vector_size=128, window=5, min_count=1, workers=4)

    # Build the location once: save and load must point at the same file so
    # the reload below actually reads the model that was just written.
    model_path = '/Users/xx/' + model_name
    model.save(model_path)

    model = Word2Vec.load(model_path)
    print(model.vector_size)
    # Tokens must exist in the trained vocabulary, otherwise gensim raises KeyError.
    print(model.wv.similarity('北京市#2', '朝阳区#4'))
    print(model.wv.most_similar('上海市#2'))
    

    from gensim.models.doc2vec import TaggedDocument, Doc2Vec

    # Doc2Vec: treat every address (its token list) as one document tagged with
    # its row position, train a model, then look up the documents closest to a
    # sample address.
    words_list = df['model_input'].to_list()
    tagged_doc = [
        TaggedDocument(words=tokens, tags=[idx])
        for idx, tokens in enumerate(words_list)
    ]
    print(tagged_doc[:2])

    model = Doc2Vec(vector_size=20, min_count=2, epochs=40)
    model.build_vocab(tagged_doc)
    model.train(
        tagged_doc,
        total_examples=model.corpus_count,
        epochs=model.epochs,
    )

    # Query with the document at position n.
    n = 1
    query_words = words_list[n]
    print(query_words)
    query_vector = model.infer_vector(query_words)
    ss = model.dv.most_similar([query_vector], topn=3)
    # Each hit is a (tag, similarity) pair; only report the strong matches.
    for tag, score in ss:
        if score > 0.8:
            print((tag, score))
            print(words_list[tag])
    
    

    相关文章

      网友评论

          本文标题:gensim

          本文链接:https://www.haomeiwen.com/subject/okbvqrtx.html