Text Mining Test

Author: 在做算法的巨巨 | Published 2018-07-23 20:02
import codecs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
f = codecs.open("C:\\Users\\Data Engineer\\Desktop\\xx\\十九大政府工作报告.txt",'r','utf8')
doc = f.read()
f.close()
doc
  • Load common stopwords

stopwords= pd.read_csv("C:\\Users\\Data Engineer\\Desktop\\xx\\StopwordsCN.txt",encoding='utf8',index_col=False)
  • Load domain-specific dictionaries

import jieba
for filename in ['党建理论词汇.txt','九大常委20120821.txt','科学发展观.txt','社会主义革命和社会主义建设词汇.txt',
                '社会主义好.txt','社会主义核心价值观.txt','十八大报告节选.txt','政治学词库.txt']:
    jieba.load_userdict("C:\\Users\\Data Engineer\\Desktop\\xx\\党政专业词汇\\"+filename)
wordsgroup=[]
for word in jieba.cut(doc):
    if word not in stopwords['stopword'].values and len(word.strip())>1:  #membership must be checked against the stopword column; "word not in stopwords" only checks column names
        wordsgroup.append(word)
text=pd.DataFrame({'words':wordsgroup})
text = text.groupby('words').size().reset_index(name='计数').sort_values('计数', ascending=False)  #count occurrences of each word
from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
#scipy.misc.imread has been removed from SciPy; matplotlib's imread works here instead
bimg = plt.imread("C:\\Users\\Data Engineer\\Desktop\\xx\\chinamap.jpg")
image_colors = ImageColorGenerator(bimg)
wordcloud = WordCloud(font_path='C:\\Users\\Data Engineer\\Desktop\\xx\\2.4 词云绘制\\2.4\\simhei.ttf',background_color='black',mask=bimg,max_font_size=300)  #the mask argument supplies the background shape
words = text.set_index('words').to_dict()  #{'计数': {word: count, ...}}
wordcloud.fit_words(words['计数'])  #fit the cloud on the word -> count mapping
wordcloud.recolor(color_func=image_colors)  #recolor the words using colors sampled from the background image
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
  • Keyword extraction

  1. Keyword extraction with jieba
import jieba.analyse
tags = jieba.analyse.extract_tags(doc, topK=5)
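
For reference, extract_tags can also return each keyword's TF-IDF weight via its withWeight parameter; a minimal sketch (the variable name tags_weighted is just for illustration):

tags_weighted = jieba.analyse.extract_tags(doc, topK=5, withWeight=True)  #list of (keyword, weight) pairs
for tag, weight in tags_weighted:
    print(tag, round(weight, 4))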

  2. Keyword extraction with sklearn

wordscombine=' '.join(wordsgroup)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
#convert the text into a term-count matrix
countvec = CountVectorizer(min_df=0, token_pattern=r'\b\w+\b')
#count the term frequencies
content=countvec.fit_transform([wordscombine])
#instantiate the transformer
transf = TfidfTransformer()
#compute the tf-idf values of the matrix (with a single document, idf is constant, so the ranking follows raw term frequency)
tfidf = transf.fit_transform(content)
sort=np.argsort(tfidf.toarray())[:,-5:]  #column indices of the five largest tf-idf scores, in ascending order
names = countvec.get_feature_names()  #use get_feature_names_out() on scikit-learn >= 1.0
keywords = pd.Index(names)[sort].values
print(content.toarray())
print(tfidf.toarray())
tagdf=pd.DataFrame({'tag1':keywords[:,4],
                  'tag2':keywords[:,3],
                  'tag3':keywords[:,2],
                  'tag4':keywords[:,1],
                  'tag5':keywords[:,0],})
tagdf
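
As an aside, the CountVectorizer + TfidfTransformer pair above can be collapsed into sklearn's TfidfVectorizer, which performs both steps at once; a minimal sketch over the same corpus (the names tfidfvec, tfidf2 and top5 are illustrative):

from sklearn.feature_extraction.text import TfidfVectorizer
tfidfvec = TfidfVectorizer(token_pattern=r'\b\w+\b')  #counting and tf-idf weighting in one step
tfidf2 = tfidfvec.fit_transform([wordscombine])
top5 = np.argsort(tfidf2.toarray())[:, -5:]  #indices of the five largest scores
print(pd.Index(tfidfvec.get_feature_names_out())[top5].values)  #get_feature_names() on older scikit-learn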
  3. Writing the functions ourselves
def word2vec(verblist):  #input: a list of tokenized documents, each itself a list of words
    #step 1: collect the vocabulary
    uniquemat=set([])
    for words in verblist:
        uniquemat = uniquemat | set(words)
    uniquemat=list(uniquemat)
    #step 2: count term occurrences per document
    vectormat=[]
    for words in verblist:
        vector=[0]*len(uniquemat)
        for word in words:
            if word in uniquemat:
                vector[uniquemat.index(word)]+=1
        vectormat.append(vector)
    vectormat=pd.DataFrame(vectormat).T  #rows = words, columns = documents
    return vectormat, uniquemat
def count(vectormat):  #total occurrences of each word across all documents
    return vectormat.sum(axis=1)
def tfcount(vectormat):  #term frequency, normalized by the vocabulary size
    return vectormat.sum(axis=1)/len(vectormat)
def idfcount(vectormat):  #inverse document frequency; note it uses total counts rather than document frequency
    return np.log2(vectormat.columns.size/(vectormat.sum(axis=1)+1))
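
To make the helpers concrete: with vectormat holding one row per word and one column per document, and N = vectormat.columns.size documents, the three functions compute, for each word w,

    count(w) = total number of occurrences of w across all documents
    tf(w)    = count(w) / vocabulary size
    idf(w)   = log2( N / (count(w) + 1) )

and these are later combined as tfidf = tf * idf. Note that this deviates slightly from the textbook definition, in which idf is based on the number of documents that contain w rather than on count(w).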
import re
doc1=re.split(r'[。?!\n]\s*',doc)  #split the document into sentences
zh =re.compile(u'[\u4e00-\u9fa5]+')  #matches runs of Chinese characters
wordsgroup=[]
for words in doc1:
    segs=[]
    for word in jieba.cut(words):
        if len(word.strip())>1 and word not in stopwords['stopword'].values and zh.search(word):
            segs.append(word)
    wordsgroup.append(segs)      
[wordmat,uniquemat]=word2vec(wordsgroup)
wordtfidf=pd.DataFrame({'words':uniquemat,'count':count(wordmat),'tf':tfcount(wordmat),'idf':idfcount(wordmat)})
wordtfidf['tfidf']=wordtfidf.idf*wordtfidf.tf
wordtfidf.sort_values(by='tfidf',ascending=False).head(5)
  • Automatic summarization

  1. Implementation with sklearn
doc1=[doc]+re.split(r'[。?!\n]\s*',doc)  #the full document followed by its individual sentences
from sklearn.metrics import pairwise_distances
segments=[]
suitCorpos=[]
for cont in doc1:
    segs = jieba.cut(cont)
    segment = " ".join(segs)
    if len(segment.strip())>10:
        segments.append(segment)
        suitCorpos.append(cont)
countVectorizer = CountVectorizer(stop_words=list(stopwords['stopword'].values),min_df=0, token_pattern=r"\b\w+\b")
textVector = countVectorizer.fit_transform(segments)
distance_matrix = pairwise_distances(textVector,metric="cosine")  #cosine distance between every pair of segments
sort = np.argsort(distance_matrix, axis=1)  #each row: column indices ordered from closest to farthest
summary = pd.Index(suitCorpos)[sort[0]].values[1]  #row 0 is the full document; position 0 is itself, position 1 is the closest sentence
summary
  2. Implementation with custom functions

The core of automatic summarization is finding the one sentence that best matches the content of the whole document. To do this, we first need to know what the document as a whole is about, so we traverse the full text and build a document-level term vector; we then traverse every sentence and build a term vector for each one. What remains is vector matching, using cosine similarity.
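
As a quick illustration of the cosine measure used below, here is a toy example (the two count vectors are made up purely for illustration):

a = np.array([2, 1, 0, 3])  #illustrative term counts for the full document
b = np.array([1, 1, 0, 2])  #illustrative term counts for one sentence
cos_theta = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(cos_theta)  #close to 1.0 when the two vectors point in similar directions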

doc2=[doc]+re.split(r'[。?!\n]\s*',doc)  #put both the full document and the split sentences into the list

Compared with doc1, here we prepend the full document to the list. We do this because each sentence's vector has to be compared against the vector of the whole document, so it is simplest to build them all in one pass.

def similarity(vectormat):
    #cosine similarity between column 0 (the full document) and every column
    simimat=np.zeros((vectormat.columns.size))
    for j in range(vectormat.columns.size):
        simimat[j]=sum(vectormat.iloc[:,0]*vectormat.iloc[:,j])/(np.sqrt(sum(vectormat.iloc[:,0]**2))*np.sqrt(sum(vectormat.iloc[:,j]**2)))
    return simimat
contents=[]
wordsgroup=[]
for content in doc2:
    if len(content.strip())>1:
        segs=[]
        for word in jieba.cut(content):
            if zh.search(word) and len(word.strip())>1 and word not in stopwords['stopword'].values:
                segs.append(word)
        wordsgroup.append(segs)
        contents.append(content)
[wordmat, uniquemat] = word2vec(wordsgroup)
CosineSimilarity=similarity(wordmat)
contentStat=pd.DataFrame({'content':contents,'CosineSimilarity':CosineSimilarity})
contentStat.loc[1::].sort_values('CosineSimilarity',ascending=False).head(5)
summary=list(contentStat.loc[1::].sort_values('CosineSimilarity',ascending=False).head(1).content)

Here we take the sentence that is closest to the whole document, i.e. the sentence with the largest cos(θ).
