美文网首页
gensim Word2Vec 基本用法

gensim Word2Vec 基本用法

作者: 万州客 | 来源:发表于2023-03-24 08:37 被阅读0次

    慢慢走,关键不要停下来。

    参考URL:
    https://zhuanlan.zhihu.com/p/28943718

    一,代码

    # Word2Vec
    import os
    import jieba
    # 数据训练与分析
    from gensim.models import word2vec
    
    # Input corpus, jieba-segmented corpus, and saved model locations.
    org_txt = './3b.txt'
    cut_txt = './3b_jieba.txt'
    model_name = './3b.model'


    def prepare_corpus(src_path, dst_path, src_encoding='utf-8'):
        """Segment a raw text file with jieba into a space-separated corpus.

        Each line of ``src_path`` is tokenised with ``jieba.cut`` and the tokens
        are written to ``dst_path`` joined by single spaces. Any existing file
        at ``dst_path`` is overwritten.

        Args:
            src_path: Path of the raw text file to segment.
            dst_path: Path of the segmented corpus to write (always UTF-8).
            src_encoding: Encoding of ``src_path``. Defaults to UTF-8 so the
                result does not depend on the platform's locale encoding;
                pass e.g. ``'gbk'`` if the raw file is stored that way.
        """
        # Open the output once in write mode instead of deleting it and then
        # re-opening it in append mode for every input line.
        with open(src_path, 'r', encoding=src_encoding) as src, \
                open(dst_path, 'w', encoding='utf-8') as dst:
            for line in src:
                dst.write(' '.join(jieba.cut(line)))


    def train_model(corpus_path, model_path):
        """Train a Word2Vec model on a segmented corpus and save it.

        Args:
            corpus_path: Path of the space-separated corpus file.
            model_path: Path where the trained model is saved.

        Returns:
            The trained ``Word2Vec`` model.
        """
        # Load the corpus.
        sentences = word2vec.Text8Corpus(corpus_path)
        # Train the model with the library's default hyper-parameters.
        trained = word2vec.Word2Vec(sentences)

        # Show the 10 words most similar to the query word.
        for word, score in trained.wv.most_similar(positive=['叶文洁'], topn=10):
            print(word, score)

        # Persist the model so later runs can skip training.
        trained.save(model_path)
        return trained


    # Reuse the saved model when it exists; otherwise build it from the raw
    # text first, so the script also works on a first run.
    if os.path.exists(model_name):
        model = word2vec.Word2Vec.load(model_name)
    else:
        prepare_corpus(org_txt, cut_txt)
        model = train_model(cut_txt, model_name)

    # Other uses of Word2Vec.
    # Similarity between two words.
    print(model.wv.similarity('叶文洁', '杨冬'))
    # Similarity between two sets of words.
    list1 = ['周文王', '秦始皇']
    list2 = ['丁仪', '大史']
    print(model.wv.n_similarity(list1, list2))

    # Pick the word that does not belong with the others.
    list3 = ['叶文洁', '杨冬', '丁仪', '周文王']
    print(model.wv.doesnt_match(list3))

    # Inspect the vector of a word: its type, its length, and its values.
    print(type(model.wv['叶文洁']))
    print(len(model.wv['叶文洁']))
    print(model.wv['叶文洁'])
    
    

    二,输出

    C:\Users\ccc\AppData\Local\Programs\Python\Python310\python.exe D:\tmp\textclass\pytxt.py 
    0.980158
    0.99339145
    杨冬
    <class 'numpy.ndarray'>
    100
    [-0.52459264  0.42894545  0.2405044   0.5670044   0.17116037 -0.4805367
      0.5195656   0.76600087 -0.29118067 -0.40652213  0.02053648 -0.55843574
     -0.00442393  0.13712034  0.27877867 -0.6454355   0.47351456 -0.63771605
     -0.30295673 -0.95495903  0.7742779   0.03675407  0.90363574 -0.642732
     -0.13329592  0.06366161 -0.3403903  -0.38197997 -0.585721    0.22453749
      0.4137146  -0.1787358   0.10432581 -0.62286484 -0.03213086  0.4513987
      0.20170954 -0.11258729 -0.3685589  -0.3779576  -0.34012443 -0.26127875
     -0.04648408 -0.05903878  0.39310023  0.36001405 -0.43748587 -0.11494742
      0.238909    0.11225623  0.37921125 -0.21532379 -0.04863171  0.1027706
     -0.4405764   0.25152412  0.34206703  0.09669768 -0.6519396   0.20798652
     -0.07624672 -0.31123203 -0.08219526 -0.01892793 -0.38171357  0.5272733
     -0.47259566  0.4399706  -0.5804483   0.3796891   0.03033544  0.5003571
      0.33728248 -0.04558028  0.15972577 -0.2579778   0.2591412   0.13919042
     -0.41843528 -0.04623118 -0.8242269   0.09157909 -0.49296165  0.32215548
     -0.3206883  -0.3282018   0.34820762  0.0526211   0.77261025  0.3747695
      0.51636213  0.10613303 -0.22898611 -0.29524592  0.86904347  0.43539488
      0.18083698 -0.48167413  0.08040039  0.13137183]
    
    Process finished with exit code 0
    
    

    相关文章

      网友评论

          本文标题:gensim Word2Vec 基本用法

          本文链接:https://www.haomeiwen.com/subject/khpwrdtx.html