美文网首页
gensim Word2Vec 基本用法

gensim Word2Vec 基本用法

作者: 万州客 | 来源:发表于2023-03-24 08:37 被阅读0次

慢慢走,关键不要停下来。

参考URL:
https://zhuanlan.zhihu.com/p/28943718

一、代码

# Word2Vec
import os
import jieba
# 数据训练与分析
from gensim.models import word2vec

org_txt = './3b.txt'
cut_txt = './3b_jieba.txt'
model_name = './3b.model'


def segment_corpus(src_path, dst_path):
    """Tokenize ``src_path`` with jieba and write the result to ``dst_path``.

    Every input line is passed to ``jieba.cut`` and its tokens are written
    out joined by single spaces. ``dst_path`` is opened once in write mode,
    so any previous content is replaced.
    """
    # NOTE(review): the raw corpus is read with the platform default
    # encoding, exactly as the original snippet did. Pass an explicit
    # ``encoding=`` here if the corpus encoding differs from the locale.
    with open(src_path, 'r') as src, \
            open(dst_path, 'w', encoding='utf-8') as dst:
        for line in src:
            dst.write(' '.join(jieba.cut(line)))


def train_model(corpus_path, model_path):
    """Train a Word2Vec model on a segmented corpus, save it and return it.

    After training, the 10 words most similar to '叶文洁' are printed as a
    quick sanity check before the model is written to ``model_path``.
    """
    # Load the corpus.
    sentences = word2vec.Text8Corpus(corpus_path)
    # Train the model.
    trained = word2vec.Word2Vec(sentences)

    # Show the 10 closest words.
    for word, score in trained.wv.most_similar(positive=['叶文洁'], topn=10):
        print(word, score)

    # Persist the model so later runs can simply load it.
    trained.save(model_path)
    return trained


# Reuse a previously saved model when one exists; otherwise prepare the data
# and train from scratch. This replaces the old workflow of manually
# (un)commenting the training code.
if os.path.exists(model_name):
    model = word2vec.Word2Vec.load(model_name)
else:
    segment_corpus(org_txt, cut_txt)
    model = train_model(cut_txt, model_name)

# Other uses of Word2Vec
# Similarity between two words.
print(model.wv.similarity('叶文洁', '杨冬'))
# Similarity between two sets of words.
list1 = ['周文王', '秦始皇']
list2 = ['丁仪', '大史']
print(model.wv.n_similarity(list1, list2))

# Pick the word that does not belong with the others.
list3 = ['叶文洁', '杨冬', '丁仪', '周文王']
print(model.wv.doesnt_match(list3))

# Inspect the vector of a single word: its type, its length, its values.
print(type(model.wv['叶文洁']))
print(len(model.wv['叶文洁']))
print(model.wv['叶文洁'])

二、输出

C:\Users\ccc\AppData\Local\Programs\Python\Python310\python.exe D:\tmp\textclass\pytxt.py 
0.980158
0.99339145
杨冬
<class 'numpy.ndarray'>
100
[-0.52459264  0.42894545  0.2405044   0.5670044   0.17116037 -0.4805367
  0.5195656   0.76600087 -0.29118067 -0.40652213  0.02053648 -0.55843574
 -0.00442393  0.13712034  0.27877867 -0.6454355   0.47351456 -0.63771605
 -0.30295673 -0.95495903  0.7742779   0.03675407  0.90363574 -0.642732
 -0.13329592  0.06366161 -0.3403903  -0.38197997 -0.585721    0.22453749
  0.4137146  -0.1787358   0.10432581 -0.62286484 -0.03213086  0.4513987
  0.20170954 -0.11258729 -0.3685589  -0.3779576  -0.34012443 -0.26127875
 -0.04648408 -0.05903878  0.39310023  0.36001405 -0.43748587 -0.11494742
  0.238909    0.11225623  0.37921125 -0.21532379 -0.04863171  0.1027706
 -0.4405764   0.25152412  0.34206703  0.09669768 -0.6519396   0.20798652
 -0.07624672 -0.31123203 -0.08219526 -0.01892793 -0.38171357  0.5272733
 -0.47259566  0.4399706  -0.5804483   0.3796891   0.03033544  0.5003571
  0.33728248 -0.04558028  0.15972577 -0.2579778   0.2591412   0.13919042
 -0.41843528 -0.04623118 -0.8242269   0.09157909 -0.49296165  0.32215548
 -0.3206883  -0.3282018   0.34820762  0.0526211   0.77261025  0.3747695
  0.51636213  0.10613303 -0.22898611 -0.29524592  0.86904347  0.43539488
  0.18083698 -0.48167413  0.08040039  0.13137183]

Process finished with exit code 0

相关文章

网友评论

      本文标题:gensim Word2Vec 基本用法

      本文链接:https://www.haomeiwen.com/subject/khpwrdtx.html