美文网首页
python文本相似度计算

python文本相似度计算

作者: 轻语风 | 来源:发表于2020-07-22 18:56 被阅读0次

    话不多说，直接上源码：

    import jieba
    from gensim import corpora, models, similarities
    import codecs
    
    
    def cut_words(file, encoding=None):
        """Read a text file and segment its whole content with jieba.

        Args:
            file: Path of the text file to read.
            encoding: Text encoding used to decode the file. ``None`` (the
                default) lets ``open`` pick the platform's locale encoding,
                which is what this function always did; pass an explicit
                value such as ``'utf-8'`` or ``'GBK'`` to make the result
                independent of the machine the script runs on.

        Returns:
            list: The token list returned by ``jieba.lcut`` for the file's
            text. No filtering is applied here.
        """
        with open(file, 'r', encoding=encoding) as f:
            text = f.read()
        # Segmentation only needs the text, so the file is already closed here.
        return jieba.lcut(text)
    
    
    def drop_Disable_Words(cut_res, stopwords):
        """Remove stop words and layout tokens from a token sequence.

        Args:
            cut_res: Iterable of word tokens.
            stopwords: Collection of stop-word strings to discard.

        Returns:
            list: The tokens of ``cut_res`` in their original order, without
            any stop word, newline token, or ideographic-space (U+3000) token.
        """
        # Build the lookup set once. Testing membership against a list costs
        # O(len(stopwords)) per token, which adds up on long documents.
        blocked = set(stopwords) | {"\n", "\u3000"}
        return [word for word in cut_res if word not in blocked]
    
    
    def read_stop_word(file_path):
        """Load a GBK-encoded stop-word file, one entry per line.

        Args:
            file_path: Path of the stop-word file.

        Returns:
            list: Every line of the file with surrounding whitespace stripped,
            in file order. A blank line yields an empty string.
        """
        # The context manager guarantees the handle is closed, even if
        # decoding raises part-way through the file.
        with codecs.open(file_path, 'r', encoding='GBK') as f:
            lines = f.readlines()
        return [line.strip() for line in lines]
    
    
    # Inputs: the documents to compare and the stop-word list.
    files = [
        'F:/fenxi/测试之美食.txt',
        'F:/fenxi/测试之名人自述.txt',
        'F:/fenxi/吃货大师的欲望清单.txt',
        'F:/fenxi/名人自述1.txt',
        'F:/fenxi/名人自述2.txt',
        'F:/fenxi/世界上什么下酒菜最美.txt',
        'F:/fenxi/鱼藏剑和酿菜.txt',
        'F:/fenxi/臭味食物.txt',
    ]
    stopwords = read_stop_word("F:/fenxi/stop_word.txt")

    # Segment every file and strip its stop words, giving one token list per
    # document in the same order as ``files``.
    corpus = [
        drop_Disable_Words(cut_words(path), stopwords)
        for path in files
    ]
    
    
    # Bag-of-words model: build the token dictionary from the corpus, then
    # convert each document's token list with ``doc2bow``.
    dictionary = corpora.Dictionary(corpus)
    doc_vectors = [dictionary.doc2bow(tokens) for tokens in corpus]

    # TF-IDF model fitted on the bag-of-words vectors and applied to them.
    tfidf = models.TfidfModel(doc_vectors)
    tfidf_vectors = tfidf[doc_vectors]

    # Sanity output: number of documents, then the size and content of the
    # first document's TF-IDF vector.
    print(len(tfidf_vectors))
    first_doc = tfidf_vectors[0]
    print(len(first_doc))
    print(first_doc)
    
    
    # Similarity of document 0 to every document, in TF-IDF space
    def TF_IDF(tfidf_vectors, doc_vectors):
        """Print and return the similarity of document 0 to all documents.

        Args:
            tfidf_vectors: Indexable corpus of TF-IDF vectors. It is used both
                to build the similarity index and to supply the query, which
                is its element 0.
            doc_vectors: Bag-of-words vectors of the same documents. Kept in
                the signature for backward compatibility; not used, because
                the query must live in the same space as the index.

        Returns:
            list: ``(document_index, similarity)`` pairs in index order. The
            same list is printed.
        """
        index = similarities.MatrixSimilarity(tfidf_vectors)
        # Query with the TF-IDF vector of document 0. The index holds TF-IDF
        # weights, so a raw bag-of-words query (doc_vectors[0]) would be
        # compared in a different vector space and skew the scores.
        sims = index[tfidf_vectors[0]]
        scores = list(enumerate(sims))
        print(scores)
        return scores
    
    
    # Similarity of document 0 to every document, in LSI topic space
    def LSI(tfidf_vectors, dictionary, doc_vectors, theme_num):
        """Fit an LSI model and print/return document 0's similarity to all.

        Args:
            tfidf_vectors: Indexable corpus of TF-IDF vectors. The LSI model
                is trained on it, and its element 0 is the query.
            dictionary: Token dictionary passed to the model as ``id2word``.
            doc_vectors: Bag-of-words vectors of the same documents. Kept in
                the signature for backward compatibility; not used, because
                the model is trained on TF-IDF input.
            theme_num: Number of LSI topics (``num_topics``).

        Returns:
            list: ``(document_index, similarity)`` pairs in index order. The
            same list is printed.
        """
        lsi = models.LsiModel(tfidf_vectors, id2word=dictionary, num_topics=theme_num)
        lsi_vector = lsi[tfidf_vectors]
        index = similarities.MatrixSimilarity(lsi_vector)
        # Project the TF-IDF vector of document 0, not its raw bag-of-words
        # counts: the model was trained on TF-IDF weights, so the query has
        # to go through the same transformation as the indexed documents.
        query_lsi = lsi[tfidf_vectors[0]]
        sims = index[query_lsi]
        scores = list(enumerate(sims))
        print(scores)
        return scores
    
    
    # Compute similarities with the LSI model, using two topics.
    LSI(tfidf_vectors, dictionary, doc_vectors, theme_num=2)

    # Compute similarities with the TF-IDF model.
    TF_IDF(tfidf_vectors, doc_vectors)
    
    

    相关文章

      网友评论

          本文标题:python文本相似度计算

          本文链接:https://www.haomeiwen.com/subject/qnbvcktx.html