Python topic model analysis (archive)
from gensim import corpora, models
import jieba.posseg as jp
import numpy as np
# Text collection: one document per line
f = open(r'<path to corpus>', encoding='utf-8')
texts = [[word for word in line.split()] for line in f]
# Number of documents
M = len(texts)
# Filtering conditions for segmentation
flags = ('n', 'nr', 'ns', 'nt', 'eng', 'v', 'd')  # POS tags to keep
# Load the stop-word list into a set for fast membership tests
stopwords = set(open(r'C:\Users\Hikaru\PycharmProjects\pythonPractice\datamining\stopwords.txt', encoding='utf-8').read().split())
stopword = ['想想', '越来越']  # extra stop words
# Segment each document and keep only words that pass the filters
words_ls = []
for doc in texts:
    text = ''.join(doc)
    words = [w.word for w in jp.cut(text)
             if w.flag in flags
             and w.word not in stopwords
             and w.word not in stopword
             and len(w.word) > 1]
    words_ls.append(words)
# print(words_ls)
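# Quick illustration (added, not part of the original script): jp.cut yields pair
# objects carrying .word (the token) and .flag (its POS tag), which is what the
# filter above relies on.
# for w in jp.cut('今天天气不错'):
#     print(w.word, w.flag)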
# Build the dictionary
dictionary = corpora.Dictionary(words_ls)
# Map each document to a sparse bag-of-words vector (a list of (token_id, count)
# pairs), collecting them into the sparse corpus
corpus = [dictionary.doc2bow(words) for words in words_ls]
print(dictionary.token2id)
print(corpus)
# Re-weight the corpus with TF-IDF
corpus_tfidf = models.TfidfModel(corpus)[corpus]
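# Sketch (added): inspect the TF-IDF weights of the first document; each entry
# is a (token_id, weight) pair.
# for token_id, weight in corpus_tfidf[0]:
#     print(dictionary[token_id], round(weight, 3))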
# LDA model; num_topics sets the number of topics.
# alpha is the document-topic prior and eta the topic-word prior: smaller values
# give sparser distributions (each document concentrates on fewer topics, each
# topic on fewer words).
lda = models.ldamodel.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=3, alpha=0.01, eta=0.01)
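# Optional sketch (added): score the fitted model with gensim's CoherenceModel;
# 'c_v' is one common coherence measure, and higher usually means better topics.
# from gensim.models import CoherenceModel
# cm = CoherenceModel(model=lda, texts=words_ls, dictionary=dictionary, coherence='c_v')
# print('coherence (c_v):', cm.get_coherence())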
# Print the learned topics
for topic in lda.print_topics(num_words=10):
    print(topic)
# Topic inference
# for e, values in enumerate(lda.inference(corpus)[0]):
#     print(texts[e])
#     for ee, value in enumerate(values):
#         print('\tTopic %d inferred weight %.2f' % (ee, value))
# # Print the topic distribution of each document
# num_show_topic = 5  # number of top topics to show per document
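# Alternative sketch (added): per-document topic distributions via
# get_document_topics; minimum_probability=0 keeps every topic in the output.
# for e, bow in enumerate(corpus_tfidf):
#     doc_topics = lda.get_document_topics(bow, minimum_probability=0)
#     print(texts[e], ['Topic %d: %.2f' % (tid, p) for tid, p in doc_topics])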
# Word distribution of each topic
num_show_term = 10  # number of words to show per topic
for topic_id in range(lda.num_topics):
    print('Topic #%d:\t' % topic_id)
    # (token_id, probability) pairs for the top words of this topic
    term_distribute_all = lda.get_topic_terms(topicid=topic_id, topn=num_show_term)
    term_distribute = term_distribute_all[:num_show_term]
    print(term_distribute)
    term_distribute = np.array(term_distribute)
    term_id = term_distribute[:, 0].astype(int)
    print('Words: ', end="")
    for t in term_id:
        print(dictionary[int(t)], end=' ')
    print()
    print('Probabilities:', term_distribute[:, 1])
# # Topic inference over the full corpus
# print(lda.inference(corpus))
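# Sketch (added): persist the fitted model and dictionary so the analysis can be
# reloaded later without retraining; 'lda.model'/'lda.dict' are example filenames.
# lda.save('lda.model')
# dictionary.save('lda.dict')
# lda2 = models.ldamodel.LdaModel.load('lda.model')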