美文网首页
python LDA分析

python LDA分析

作者: VivaVida | 来源:发表于2022-05-05 15:51 被阅读0次

    Python 主题模型(LDA)分析代码存档


    # LDA topic-model analysis of a text corpus using jieba (tokenising and
    # part-of-speech tagging) and gensim (dictionary, TF-IDF, LDA).
    #
    # Pipeline: read one document per line -> POS-tag and filter the words ->
    # build a dictionary and bag-of-words corpus -> TF-IDF weighting -> train
    # LDA -> print the top terms of every topic.

    import jieba
    import jieba.posseg as jp
    import numpy as np
    from gensim import corpora, models

    # Corpus: one document per line. The context manager guarantees that the
    # handle is closed once the lines have been read.
    with open(r'文档路径', encoding='utf-8') as f:
        texts = [line.split() for line in f]

    # Number of documents.
    M = len(texts)

    # Part-of-speech tags a word must carry to be kept.
    flags = ('n', 'nr', 'ns', 'nt', 'eng', 'v', 'd')

    # Stop words loaded from disk into a set. The original kept the open file
    # object and tested `word in file`, which exhausts the file on the first
    # test and compares against lines that still carry their newline, so the
    # stop-word list was effectively ignored.
    # NOTE(review): no encoding is given, so the platform default is used as in
    # the original - confirm the encoding of stopwords.txt.
    with open(r'C:\Users\Hikaru\PycharmProjects\pythonPractice\datamining\stopwords.txt') as stopword_file:
        stopwords = {line.strip() for line in stopword_file if line.strip()}

    # Extra hand-picked stop words.
    stopword = ['想想', '越来越']

    # Tokenise every document and keep only the words that pass all filters.
    words_ls = []
    for tokens in texts:
        # Re-join the whitespace-split pieces so jieba segments the full line.
        text = ''.join(tokens)
        words = [
            w.word
            for w in jp.cut(text)
            if w.flag in flags
            and w.word not in stopwords
            and w.word not in stopword
            and len(w.word) > 1
        ]
        words_ls.append(words)

    # print(words_ls)

    # The original re-read the already exhausted file handle here, which always
    # produced an empty list; reuse the lines that were parsed above instead.
    text2 = [list(tokens) for tokens in texts]

    # Build the token <-> id dictionary.
    dictionary = corpora.Dictionary(words_ls)

    # Convert every document into a sparse bag-of-words vector.
    corpus = [dictionary.doc2bow(words) for words in words_ls]

    print(dictionary.token2id)
    print(corpus)

    # TF-IDF weighting of the corpus.
    corpus_tfidf = models.TfidfModel(corpus)[corpus]

    # LDA model; num_topics sets the number of topics. alpha is the prior on the
    # per-document topic distribution and eta the prior on the per-topic word
    # distribution.
    lda = models.ldamodel.LdaModel(
        corpus_tfidf,
        id2word=dictionary,
        num_topics=3,
        alpha=0.01,
        eta=0.01,
    )

    # Top words of every topic.
    for topic in lda.print_topics(num_words=10):
        print(topic)

    # Topic inference per document (left disabled, as in the original).
    # for e, values in enumerate(lda.inference(corpus)[0]):
    #    print(texts[e])
    #    for ee, value in enumerate(values):
    #        print('\t主题%d推断值%.2f' % (ee, value))

    # # Print the topic distribution of each document
    # num_show_topic = 5  # how many top topics to show per document

    # Word distribution of each topic.
    num_show_term = 10  # how many words to show per topic

    # Iterate over the topics the model really has; the original looped over
    # range(20) although only 3 topics were trained, which runs out of range.
    for topic_id in range(lda.num_topics):
        print('主题#%d:\t' % topic_id)

        # (word id, probability) pairs of the most probable words of the topic.
        term_distribute = lda.get_topic_terms(topicid=topic_id, topn=num_show_term)
        print(term_distribute)

        term_distribute = np.array(term_distribute)
        # np.int was removed in NumPy 1.24; the builtin int is the replacement.
        term_id = term_distribute[:, 0].astype(int)

        print('词:', end="")
        for t in term_id:
            # dictionary[...] fills the lazily built id -> token map on demand,
            # unlike reading dictionary.id2token directly.
            print(dictionary[int(t)], end=' ')

        print('概率:', end="")
        print(term_distribute[:, 1])

    #
    # # # Topic inference
    # print(lda.inference(corpus))

    相关文章

      网友评论

          本文标题:python LDA分析

          本文链接:https://www.haomeiwen.com/subject/unamyrtx.html