慢慢走,关键是不要停下来。
参考URL:
https://zhuanlan.zhihu.com/p/28943718
一、代码
# Word2Vec
import os
import jieba
# 数据训练与分析
from gensim.models import word2vec
# Paths used by the script (all relative to the working directory).
org_txt = './3b.txt'          # raw source text
cut_txt = './3b_jieba.txt'    # jieba-segmented corpus, tokens joined by spaces
model_name = './3b.model'     # saved Word2Vec model


def prepare_corpus(src_path, dst_path, src_encoding=None):
    """Segment *src_path* with jieba and write the tokens to *dst_path*.

    Each input line is cut with ``jieba.cut`` and its tokens are written
    joined by single spaces. The output is always UTF-8 and any previous
    file at *dst_path* is overwritten.

    Args:
        src_path: Path of the raw text file to segment.
        dst_path: Path of the segmented corpus file to create.
        src_encoding: Encoding of *src_path*. ``None`` keeps the platform
            default, which is what the original script relied on.
            NOTE(review): pass ``'utf-8'`` explicitly if the corpus is UTF-8
            and the script runs on a platform with a different default.
    """
    # Opening the destination once in 'w' mode replaces the original
    # remove-then-append-per-line sequence and produces the same file.
    with open(src_path, 'r', encoding=src_encoding) as src, \
            open(dst_path, 'w', encoding='utf-8') as dst:
        for line in src:
            dst.write(' '.join(jieba.cut(line)))


def train_model(src_path, corpus_path, model_path):
    """Build the segmented corpus, train a Word2Vec model on it and save it.

    Args:
        src_path: Path of the raw text file.
        corpus_path: Path where the segmented corpus is written.
        model_path: Path where the trained model is saved.

    Returns:
        The trained ``word2vec.Word2Vec`` model.
    """
    prepare_corpus(src_path, corpus_path)
    # Load the corpus
    sentences = word2vec.Text8Corpus(corpus_path)
    # Train the model with gensim's default hyper-parameters
    trained = word2vec.Word2Vec(sentences)
    # Persist the model so later runs can load it instead of retraining
    trained.save(model_path)
    return trained


# Reuse a previously saved model when there is one; otherwise train it first
# instead of failing on a missing model file.
if os.path.exists(model_name):
    model = word2vec.Word2Vec.load(model_name)
else:
    model = train_model(org_txt, cut_txt, model_name)
    # Show the 10 words most similar to the query word right after training
    for word, score in model.wv.most_similar(positive=['叶文洁'], topn=10):
        print(word, score)

# Other uses of Word2Vec
# Similarity between two words
print(model.wv.similarity('叶文洁', '杨冬'))

# Similarity between two sets of words
list1 = ['周文王', '秦始皇']
list2 = ['丁仪', '大史']
print(model.wv.n_similarity(list1, list2))

# Pick the word that does not belong with the others
list3 = ['叶文洁', '杨冬', '丁仪', '周文王']
print(model.wv.doesnt_match(list3))

# Inspect the vector of a word: its type, its length, then its values
vector = model.wv['叶文洁']
print(type(vector))
print(len(vector))
print(vector)
二、输出(直接加载已保存的模型时的运行结果)
C:\Users\ccc\AppData\Local\Programs\Python\Python310\python.exe D:\tmp\textclass\pytxt.py
0.980158
0.99339145
杨冬
<class 'numpy.ndarray'>
100
[-0.52459264 0.42894545 0.2405044 0.5670044 0.17116037 -0.4805367
0.5195656 0.76600087 -0.29118067 -0.40652213 0.02053648 -0.55843574
-0.00442393 0.13712034 0.27877867 -0.6454355 0.47351456 -0.63771605
-0.30295673 -0.95495903 0.7742779 0.03675407 0.90363574 -0.642732
-0.13329592 0.06366161 -0.3403903 -0.38197997 -0.585721 0.22453749
0.4137146 -0.1787358 0.10432581 -0.62286484 -0.03213086 0.4513987
0.20170954 -0.11258729 -0.3685589 -0.3779576 -0.34012443 -0.26127875
-0.04648408 -0.05903878 0.39310023 0.36001405 -0.43748587 -0.11494742
0.238909 0.11225623 0.37921125 -0.21532379 -0.04863171 0.1027706
-0.4405764 0.25152412 0.34206703 0.09669768 -0.6519396 0.20798652
-0.07624672 -0.31123203 -0.08219526 -0.01892793 -0.38171357 0.5272733
-0.47259566 0.4399706 -0.5804483 0.3796891 0.03033544 0.5003571
0.33728248 -0.04558028 0.15972577 -0.2579778 0.2591412 0.13919042
-0.41843528 -0.04623118 -0.8242269 0.09157909 -0.49296165 0.32215548
-0.3206883 -0.3282018 0.34820762 0.0526211 0.77261025 0.3747695
0.51636213 0.10613303 -0.22898611 -0.29524592 0.86904347 0.43539488
0.18083698 -0.48167413 0.08040039 0.13137183]
Process finished with exit code 0
网友评论