from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import jieba

# Read the full text of 西游记 (Journey to the West); the file is GB18030-encoded
text = open('C:\\Users\\longxiaojiangi\\AppData\\Local\\Packages\\903DB504.QQ_a99ra4d2cbcxa\\LocalState\\User\\1405935821\\NetworkFile\\西游记.txt', encoding='GB18030').read()

# Segment the Chinese text with jieba (precise mode, HMM enabled) and join with spaces
X = ' '.join(jieba.cut(text, cut_all=False, HMM=True))

my_wordcloud = WordCloud(
    background_color='white',
    stopwords=STOPWORDS,
    font_path='C:\\Windows\\Fonts\\simsun.ttc')  # a Chinese-capable font is required, otherwise characters render as boxes
my_wordcloud.generate(X)

plt.imshow(my_wordcloud)
plt.axis('off')
plt.show()
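
If you want to keep the figure instead of only displaying it, WordCloud can also write the rendered image straight to disk with to_file; a minimal sketch (the output filename here is a made-up example):

# Save the rendered cloud as a PNG; 'xiyouji_wordcloud.png' is a hypothetical filename
my_wordcloud.to_file('xiyouji_wordcloud.png')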
import pandas as pd

# Load the tab-separated news dataset; columns: category, theme, url, content
df_news = pd.read_table(r'C:\Users\longxiaojiangi\AppData\Local\Packages\903DB504.QQ_a99ra4d2cbcxa\LocalState\User\1405935821\NetworkFile\news.txt', names=['category', 'theme', 'url', 'content'])

# Keep only the articles in the 健康 (health) category
df_health = df_news[df_news.category == '健康']
content_list = df_health.content.values.tolist()

# Load the stop-word list; quoting=3 (QUOTE_NONE) keeps stray quote characters from breaking the parse
stopwords = pd.read_csv(r'C:\Users\longxiaojiangi\Desktop\stopwords.txt', sep='\t', quoting=3, names=['stopword'])
stopwords_list = stopwords['stopword'].values.tolist()
# Demo: segment the first article and drop newlines, single characters, and stop words
words = []
seg = jieba.lcut(content_list[0])
for word in seg:
    if word == '\n' or len(word) <= 1:
        continue
    elif word in stopwords_list:
        continue
    else:
        words.append(word)
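
A quick look at the result confirms the filter is doing what we want; a minimal check:

# Preview the first few tokens kept from the first article
print(words[:20])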
# Tokenize and preprocess the 500 health news articles the same way
words = []
for line in content_list:
    seg = jieba.lcut(line)
    for word in seg:
        if word == '\n' or len(word) <= 1:
            continue
        elif word in stopwords_list:
            continue
        else:
            words.append(word)
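
Equivalently, the nested loop above can be condensed into a list comprehension; converting the stop-word list to a set first makes each membership test O(1) instead of O(n), which matters with a long stop-word file. A sketch using the same names as above:

# Same filtering as the loop above, with a set for fast stop-word lookups
stopwords_set = set(stopwords_list)
words = [word
         for line in content_list
         for word in jieba.lcut(line)
         if word != '\n' and len(word) > 1 and word not in stopwords_set]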
df_health = pd.DataFrame({'health_words': words})
# Count occurrences of each word, then sort by frequency in descending order
words_count = df_health.groupby('health_words').size().reset_index(name='count')
words_count_sort = words_count.sort_values(by='count', ascending=False)
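
Before plotting, it is worth sanity-checking the frequency table; a quick peek at the top entries:

# Show the ten most frequent words in the health articles
print(words_count_sort.head(10))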
%matplotlib inline
cloud = WordCloud(font_path='C:\\Windows\\Fonts\\simsun.ttc', width=500, height=300)

# Build a {word: count} mapping and render the cloud from the frequency table
word_freq = {word: count for word, count in words_count_sort.values}
pic_cloud = cloud.fit_words(word_freq)

# Draw the word cloud
plt.imshow(pic_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()
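
As an aside, the pandas groupby step is not strictly necessary: collections.Counter from the standard library produces the same frequency table, and since Counter is a dict subclass it can be passed straight to fit_words. A minimal sketch reusing the words list and cloud object from above:

# Equivalent frequency counting without pandas
from collections import Counter
word_freq_counter = Counter(words)
pic_cloud = cloud.fit_words(word_freq_counter)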