Import packages
import nltk, urllib.request
from bs4 import BeautifulSoup
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba
import PIL.Image as image
import numpy as np
Fetch web text and strip the HTML markup
response = urllib.request.urlopen('http://python.org/')
html = response.read()
# nltk.clean_html() was removed in NLTK 3.x, so strip the markup with BeautifulSoup instead
html = BeautifulSoup(html, "html.parser").get_text()
tokens = html.split()
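str.split() leaves punctuation attached to words. One optional alternative is NLTK's tokenizer; a sketch assuming the punkt model is available (it can be fetched once with nltk.download('punkt')):

nltk.download('punkt')             # one-time download of the tokenizer model; a no-op if already present
tokens = nltk.word_tokenize(html)  # splits "Python." into ["Python", "."]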
Compute word frequencies
Freq_dist_nltk = nltk.FreqDist(tokens)
Freq_dist_nltk.plot(50, cumulative=False)
for k, v in Freq_dist_nltk.items():
    print(str(k) + ':' + str(v))
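Printing every (token, count) pair is noisy. FreqDist also provides most_common(), which gives a quicker overview:

print(Freq_dist_nltk.most_common(10))  # the 10 most frequent tokens as (token, count) pairs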
Remove stop words and plot the frequency curve
with open("stop.txt") as f:
    stopwords = [word.strip().lower() for word in f]
clean_tokens = [tok for tok in tokens if len(tok.lower()) > 1 and tok.lower() not in stopwords]
Freq_dist_nltk = nltk.FreqDist(clean_tokens)
Freq_dist_nltk.plot(50, cumulative=False)
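If no stop.txt is at hand, NLTK ships an English stop word list that can be dropped in instead; a sketch assuming the one-time nltk.download('stopwords') has been run:

nltk.download('stopwords')  # one-time download; a no-op if already present
from nltk.corpus import stopwords as nltk_stopwords
stopwords = set(nltk_stopwords.words('english'))  # a set also makes the membership check faster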
[Screenshot: frequency plot of the top 50 tokens after stop-word removal]
Chinese word segmentation
def trans_CN(text):
    # Segment the Chinese text with jieba and rejoin it with spaces,
    # so WordCloud can split it the same way it splits English text
    word_list = jieba.cut(text)
    result = " ".join(word_list)
    return result
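For example, with jieba's standard demo sentence (the exact segmentation can vary slightly across jieba versions and dictionaries):

print(trans_CN("我来到北京清华大学"))  # expected output: 我 来到 北京 清华大学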
Generate a Chinese word cloud
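The call below uses two names that have to be defined first: text, the jieba-segmented input, and mask, a NumPy array built from a silhouette image. A minimal setup sketch, assuming hypothetical file names chinese.txt and mask.png:

with open("chinese.txt", encoding="utf-8") as f:  # hypothetical input file
    text = trans_CN(f.read())  # segment with jieba, as defined above
mask = np.array(image.open("mask.png"))  # hypothetical silhouette image; white areas stay empty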
wordcloud = WordCloud(
    # Mask image that constrains the shape of the cloud
    mask=mask,
    # A font that supports Chinese characters; without it the Chinese text will not render
    font_path="STFANGSO.ttf"
).generate(text)
image_produce = wordcloud.to_image()
image_produce.show()
Generate an English word cloud
wordcloud = WordCloud(
    background_color="white",  # background color; the default is black
    width=1500,                # image width in pixels
    height=960,                # image height in pixels
    margin=10                  # margin around the edge of the image
).generate(" ".join(clean_tokens))  # join the tokens; str(list) would embed brackets and quotes
# Draw the image
plt.imshow(wordcloud)
# Hide the axes
plt.axis("off")
# Display the image
plt.show()
# Save the image
wordcloud.to_file('my_test2.png')
Resulting image
[Screenshot: the generated word cloud]