1.1 文本的分析与展示
在任何建模之前,我们一般都会先对数据做一定的分析,文本也不例外。不过文本不像其他业务数据那样是数值型的,很多常规的统计方式无法直接套用,而这些统计正是大家初步理解数据的手段。因此文本分析经常借助可视化,其中最常用的是词云:它是一种很有说服力的可视化方式,尤其适合展示文本的中心内容。
1.2 工具库的引入
import warings
warings.filterwarning("ignore")
import jieba
import pandas as pd
import numpy as np
import codecs # codecs提供的open方法来指定打开的文件的语言编码,它会在读取的时候自动转换为内部unicode
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.rcParam['figure.figsize'] = (10.0,5.0)
from wordcloud import WordCloud
1.3 数据读取
# Load the news table and drop every row that has a missing value.
df = pd.read_csv('./origin_data/enterainment_news.csv', encoding='utf-8')
df = df.dropna()
content = df['content'].values.tolist()

# Tokenize each document with jieba and collect the tokens into one flat list.
segment = []
for line in content:
    try:
        # Tokenize the current document, not the whole `content` list.
        segs = jieba.lcut(line)
    except Exception:
        # Best effort: show the row that could not be tokenized and move on.
        print(line)
        continue
    for seg in segs:
        # Keep tokens longer than one character and skip the '\r\n' token.
        if len(seg) > 1 and seg != '\r\n':
            segment.append(seg)
1.3.1 数据处理——去停用词
# Wrap the collected tokens in a one-column frame so they can be filtered and counted.
words_df = pd.DataFrame({'segment': segment})
words_df.head()

# Stop-word list: read as a single column named 'stopword', tab-separated,
# with quoting mode 3 passed to the parser.
stopwords = pd.read_csv(
    'origin_data/stopwords.txt',
    index_col=False,
    quoting=3,
    sep='\t',
    names=['stopword'],
    encoding='utf-8',
)
stopwords.head()

# Keep only the tokens that do not appear in the stop-word list.
is_stopword = words_df['segment'].isin(stopwords['stopword'])
words_df = words_df[~is_stopword]
1.3.2 词频统计
# Count how often each token occurs. GroupBy.size() replaces the dict-renaming
# form `.agg({'计数': ...})`, which pandas removed for grouped Series in 1.0.
words_stat = words_df.groupby('segment').size().reset_index(name='计数')
# Most frequent tokens first; the columns are 'segment' and '计数'.
words_stat = words_stat.sort_values(by=['计数'], ascending=False)
words_stat.head()
1.3.3 做词云
# Enlarge the figure for the word cloud. The rcParams key is 'figure.figsize';
# plt.rcParams is used because the file imports only `matplotlib.pyplot as plt`.
plt.rcParams['figure.figsize'] = (12, 12)
# font_path points at a font file under origin_data/ that the renderer uses for the words.
wordcloud = WordCloud(font_path='origin_data/simhei.ttf', background_color='white', max_font_size=80)
# Map token -> count for the first 100 rows of words_stat (column 0 is the token, column 1 the count).
word_frequence = {x[0]: x[1] for x in words_stat.head(100).values}
wordcloud = wordcloud.fit_words(word_frequence)
plt.imshow(wordcloud)
网友评论