Without further ado, here is the source code.
import jieba
from gensim import corpora, models, similarities
import codecs
def cut_words(file):
    # Read the document and segment it with jieba. Note: open() here relies on
    # the platform default encoding; pass encoding= explicitly if your files differ.
    with open(file, 'r') as f:
        text = f.read()
    words = jieba.lcut(text)
    # print(len(words), words)  # inspect the segmentation result
    return words
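# A quick illustration of what jieba.lcut returns (exact segmentation may vary
# with the jieba version and dictionary in use):
#   jieba.lcut("我来到北京清华大学")  ->  ['我', '来到', '北京', '清华大学']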
def drop_Disable_Words(cut_res, stopwords):
    # Drop stop words, newlines, and full-width spaces (\u3000)
    res = []
    for word in cut_res:
        if word in stopwords or word == "\n" or word == "\u3000":
            continue
        res.append(word)
    # print(len(res), res)  # inspect the result after stop-word removal
    return res
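# For example (illustrative values, not from the original post): with
# stopwords = ['的', '了'], the tokens ['好吃', '的', '美食'] are filtered
# down to ['好吃', '美食'].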
def read_stop_word(file_path):
    # The stop-word file is assumed to be GBK-encoded, one word per line.
    stopwords = codecs.open(file_path, 'r', encoding='GBK').readlines()
    # Return a set so membership tests in drop_Disable_Words are O(1)
    return set(w.strip() for w in stopwords)
# Read the raw corpus and the stop-word list
files = ['F:/fenxi/测试之美食.txt',
         'F:/fenxi/测试之名人自述.txt',
         'F:/fenxi/吃货大师的欲望清单.txt',
         'F:/fenxi/名人自述1.txt',
         'F:/fenxi/名人自述2.txt',
         'F:/fenxi/世界上什么下酒菜最美.txt',
         'F:/fenxi/鱼藏剑和酿菜.txt',
         'F:/fenxi/臭味食物.txt'
         ]
stopwords = read_stop_word("F:/fenxi/stop_word.txt")
# Segment each document and remove its stop words
corpus = []
for file in files:
    # segmentation
    cut_res = cut_words(file)
    # stop-word removal
    res = drop_Disable_Words(cut_res, stopwords)
    corpus.append(res)
# print(len(corpus))
# Build the dictionary and the bag-of-words vectors
dictionary = corpora.Dictionary(corpus)
doc_vectors = [dictionary.doc2bow(text) for text in corpus]
# print(len(doc_vectors), doc_vectors)
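# doc2bow turns a token list into sparse (token_id, count) pairs. An
# illustration with hypothetical ids (actual ids depend on the corpus):
#   dictionary.doc2bow(['北京', '美食', '北京'])  ->  [(0, 2), (1, 1)]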
# Build the TF-IDF model and transform the bag-of-words vectors
tfidf = models.TfidfModel(doc_vectors)
tfidf_vectors = tfidf[doc_vectors]
print(len(tfidf_vectors))     # number of documents
print(len(tfidf_vectors[0]))  # number of distinct terms in document 0
print(tfidf_vectors[0])       # its sparse (token_id, weight) pairs
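# To map the token ids in a TF-IDF vector back to words, you can index the
# dictionary (an optional sketch, not part of the original script):
#   for token_id, weight in tfidf_vectors[0]:
#       print(dictionary[token_id], weight)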
# Query document similarity in the plain TF-IDF space
def TF_IDF(tfidf_vectors):
    index = similarities.MatrixSimilarity(tfidf_vectors)
    # Query with document 0's TF-IDF vector so that the query lives in the
    # same space as the index (raw bag-of-words counts would be mis-scaled)
    sims = index[tfidf_vectors[0]]
    print(list(enumerate(sims)))
# Build an LSI model on top of TF-IDF and query similarity in the LSI space
def LSI(tfidf_vectors, dictionary, theme_num):
    lsi = models.LsiModel(tfidf_vectors, id2word=dictionary, num_topics=theme_num)
    lsi_vector = lsi[tfidf_vectors]
    # Project the query (document 0's TF-IDF vector) into the LSI space
    query_lsi = lsi[tfidf_vectors[0]]
    index = similarities.MatrixSimilarity(lsi_vector)
    sims = index[query_lsi]
    print(list(enumerate(sims)))
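# To see which words dominate each latent topic, LsiModel provides
# print_topics; e.g. call lsi.print_topics(theme_num) inside LSI()
# (an optional addition, not in the original post).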
# Compare document 0 against the whole corpus with the LSI model
LSI(tfidf_vectors, dictionary, 2)
# Compare document 0 against the whole corpus with plain TF-IDF
TF_IDF(tfidf_vectors)
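Both functions only print raw (index, similarity) pairs. A small follow-up sketch (assuming LSI() and TF_IDF() are changed to end with return sims instead of printing) ranks the corpus by similarity to document 0:

# Hypothetical ranking helper: assumes LSI() ends with `return sims`
sims = LSI(tfidf_vectors, dictionary, 2)
for doc_id, score in sorted(enumerate(sims), key=lambda p: p[1], reverse=True):
    print(files[doc_id], round(float(score), 4))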