import codecs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
f = codecs.open("C:\\Users\\Data Engineer\\Desktop\\xx\\十九大政府工作报告.txt",'r','utf8')
doc = f.read()
f.close()
doc
-
Import common stopwords
stopwords= pd.read_csv("C:\\Users\\Data Engineer\\Desktop\\xx\\StopwordsCN.txt",encoding='utf8',index_col=False)
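Note that stopwords is a DataFrame, so a check like word not in stopwords only inspects the column labels; the filtering steps below therefore test against the values of its stopword column. For long texts, converting that column to a Python set makes the per-token test faster (a small optional tweak, assuming the column is named stopword, as it is further below):
# optional: a set gives O(1) membership checks when filtering tokens
stopword_set = set(stopwords['stopword'])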
-
Import the domain-specific dictionaries
import jieba
# load each domain-specific user dictionary so jieba keeps these terms intact
for filename in ['党建理论词汇.txt','九大常委20120821.txt','科学发展观.txt','社会主义革命和社会主义建设词汇.txt',
                 '社会主义好.txt','社会主义核心价值观.txt','十八大报告节选.txt','政治学词库.txt']:
    jieba.load_userdict("C:\\Users\\Data Engineer\\Desktop\\xx\\党政专业词汇\\"+filename)
wordsgroup = []
# keep tokens longer than one character that are not stopwords
# (membership must be tested against the stopword column's values, not the DataFrame itself)
for word in jieba.cut(doc):
    if word not in stopwords.stopword.values and len(word.strip())>1:
        wordsgroup.append(word)
text = pd.DataFrame({'words':wordsgroup})
# count occurrences of each word and sort in descending order
text = text.groupby('words').size().reset_index(name='计数').sort_values('计数',ascending=False)
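Since all we need here is a per-word count, pandas' value_counts gives the same table in a single call; a minimal equivalent sketch (the name text_alt is only illustrative):
# count word occurrences directly; equivalent to the groupby above and already sorted descending
text_alt = pd.Series(wordsgroup).value_counts().reset_index()
text_alt.columns = ['words', '计数']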
from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
# scipy.misc.imread was removed from recent SciPy; matplotlib's imread does the same job here
bimg = plt.imread("C:\\Users\\Data Engineer\\Desktop\\xx\\chinamap.jpg")
image_colors = ImageColorGenerator(bimg)
# mask sets the background shape of the cloud
wordcloud = WordCloud(font_path='C:\\Users\\Data Engineer\\Desktop\\xx\\2.4 词云绘制\\2.4\\simhei.ttf',background_color='black',mask=bimg,max_font_size=300)
# fit_words expects a {word: frequency} dict
words = text.set_index('words').to_dict()
wordcloud.fit_words(words['计数'])
# recolor the cloud with the colors of the mask image, then display it
wordcloud.recolor(color_func=image_colors)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
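If the rendered cloud should be kept as an image file, WordCloud can also write it to disk directly (the output path below is only a placeholder):
# save the rendered cloud as a PNG; the path is hypothetical
wordcloud.to_file("C:\\Users\\Data Engineer\\Desktop\\xx\\wordcloud.png")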
-
Keyword extraction
- Keyword extraction with jieba
import jieba.analyse
tags = jieba.analyse.extract_tags(doc, topK=5)
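jieba's extract_tags also accepts withWeight=True, which returns each keyword together with its TF-IDF weight; for example:
# withWeight=True yields (keyword, weight) pairs instead of bare strings
for tag, weight in jieba.analyse.extract_tags(doc, topK=5, withWeight=True):
    print(tag, round(weight, 4))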
- Keyword extraction with sklearn
wordscombine=' '.join(wordsgroup)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# convert the text into a term-frequency matrix
countvec = CountVectorizer(min_df=0, token_pattern=r'\b\w+\b')
# count term frequencies over the single combined document
content = countvec.fit_transform([wordscombine])
# instantiate the transformer
transf = TfidfTransformer()
# compute the TF-IDF values of the matrix
tfidf = transf.fit_transform(content)
# indices of the five highest-scoring terms
sort = np.argsort(tfidf.toarray())[:,-5:]
names = countvec.get_feature_names_out()   # use get_feature_names() on older scikit-learn
keywords = np.array(names)[sort]           # map column indices back to terms
print(content.toarray())
print(tfidf.toarray())
tagdf = pd.DataFrame({'tag1':keywords[:,4],
                      'tag2':keywords[:,3],
                      'tag3':keywords[:,2],
                      'tag4':keywords[:,1],
                      'tag5':keywords[:,0]})
tagdf
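As an aside, scikit-learn's TfidfVectorizer bundles CountVectorizer and TfidfTransformer into a single estimator; a minimal sketch of the same pipeline under the same settings (not part of the original code):
from sklearn.feature_extraction.text import TfidfVectorizer
# TfidfVectorizer = CountVectorizer followed by TfidfTransformer
tfidfvec = TfidfVectorizer(min_df=0, token_pattern=r'\b\w+\b')
tfidf2 = tfidfvec.fit_transform([wordscombine])
# tfidf2 should reproduce the tfidf matrix computed step by step above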
- Implementation with hand-written functions
def word2vec(verblist):  # input: a list of tokenized sentences (each itself a list of words)
    # step 1: collect the unique vocabulary
    uniquemat = set([])
    for words in verblist:
        uniquemat = uniquemat | set(words)
    uniquemat = list(uniquemat)
    # step 2: build a term-frequency vector for each sentence
    vectormat = []
    for words in verblist:
        vector = [0]*len(uniquemat)
        for word in words:
            if word in uniquemat:
                vector[uniquemat.index(word)] += 1
        vectormat.append(vector)
    # transpose so that rows are words and columns are sentences
    vectormat = pd.DataFrame(vectormat).T
    return(vectormat, uniquemat)
def count(vectormat):
    # total count of each word across all sentences
    return(vectormat.sum(axis=1))
def tfcount(vectormat):  # normalized term frequency
    return(vectormat.sum(axis=1)/len(vectormat))
def idfcount(vectormat):
    # inverse document frequency with +1 smoothing in the denominator
    return(np.log2(vectormat.columns.size/(vectormat.sum(axis=1)+1)))
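Written out, with c(w) the total count of word w across all sentences, |V| the size of the vocabulary and N the number of sentences (columns), the helpers above compute:
tf(w) = c(w) / |V|,  idf(w) = log2( N / (c(w) + 1) ),  tfidf(w) = tf(w) · idf(w)
(Note that this idf uses the total occurrence count of w rather than the number of sentences containing it; that is what the code above actually implements.)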
import re
doc1 = re.split(r'[。?!\n]\s*', doc)   # split the document into sentences
zh = re.compile(u'[\u4e00-\u9fa5]+')   # used to keep only tokens containing Chinese characters
wordsgroup = []
for words in doc1:
    segs = []
    for word in jieba.cut(words):
        if len(word.strip())>1 and word not in stopwords.stopword.values and zh.search(word):
            segs.append(word)
    wordsgroup.append(segs)
[wordmat,uniquemat]=word2vec(wordsgroup)
wordtfidf=pd.DataFrame({'words':uniquemat,'count':count(wordmat),'tf':tfcount(wordmat),'idf':idfcount(wordmat)})
wordtfidf['tfidf']=wordtfidf.idf*wordtfidf.tf
wordtfidf.sort_values(by='tfidf',ascending=False).head(5)
-
Automatic summarization
- Implementation with sklearn
doc1 = [doc] + re.split(r'[。?!\n]\s*', doc)   # the full document plus its individual sentences
from sklearn.metrics import pairwise_distances
segments = []
suitCorpos = []
# keep only segments with enough content; store both the space-joined tokens and the raw sentence
for cont in doc1:
    segs = jieba.cut(cont)
    segment = " ".join(segs)
    if len(segment.strip())>10:
        segments.append(segment)
        suitCorpos.append(cont)
countVectorizer = CountVectorizer(stop_words=list(stopwords['stopword'].values),min_df=0, token_pattern=r"\b\w+\b")
textVector = countVectorizer.fit_transform(segments)
# cosine distance between every pair of texts; row 0 corresponds to the full document
distance_matrix = pairwise_distances(textVector, metric="cosine")
sort = np.argsort(distance_matrix, axis=1)
# in row 0, index 0 is the document itself (distance 0), so index 1 is the closest sentence
summary = pd.Index(suitCorpos)[sort[0]].values[1]
summary
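If a longer abstract is wanted, the same ranking yields the next-closest sentences as well; a small extension of the code above:
# the three sentences closest to the full document (index 0 is the document itself)
summary3 = pd.Index(suitCorpos)[sort[0][1:4]].values
print("。".join(summary3))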
- Implementation with custom functions
The core of summarization is finding the one sentence that best represents the whole document. First we need to know what the document as a whole is about, so we traverse the full text and build a document-level text vector; then we traverse each sentence and build a vector for it; what remains is vector matching, using cosine similarity.
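For reference, the cosine similarity between the document vector A and a sentence vector B is cos(θ) = (A·B) / (‖A‖ ‖B‖), which is exactly the quantity the similarity function below computes column by column.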
doc2 = [doc] + re.split(r'[。?!\n]\s*', doc)   # put both the full text and the split sentences into the list
Compared with doc1, here we merged the original full text into the list as well. We do this because each sentence's vector has to be compared against the whole document's vector, so it is simplest to keep them together.
def similarity(vectormat):
    # cosine similarity of every column (sentence) against column 0 (the full document)
    simimat = np.zeros((vectormat.columns.size))
    for j in range(vectormat.columns.size):
        simimat[j] = sum(vectormat.iloc[:,0]*vectormat.iloc[:,j])/(np.sqrt(sum(vectormat.iloc[:,0]**2))*np.sqrt(sum(vectormat.iloc[:,j]**2)))
    return(simimat)
contents = []
wordsgroup = []
for content in doc2:
    if len(content.strip())>1:
        segs = []
        for word in jieba.cut(content):
            if zh.search(word) and len(word.strip())>1 and word not in stopwords.stopword.values:
                segs.append(word)
        wordsgroup.append(segs)
        contents.append(content)
[wordmat, uniquemat] = word2vec(wordsgroup)
CosineSimilarity=similarity(wordmat)
contentStat=pd.DataFrame({'content':contents,'CosineSimilarity':CosineSimilarity})
contentStat.loc[1::].sort_values('CosineSimilarity',ascending=False).head(5)
summary=list(contentStat.loc[1::].sort_values('CosineSimilarity',ascending=False).head(1).content)
Here we take the sentence closest to the content of the full document, i.e. the sentence corresponding to the largest value of cos(θ).