最近由于大屏项目需要生成一个词云,所以在这里做个记录。
demo.py 文件内容如下:
from os import path
import chnSegment
import plotWordcloud
if __name__ == '__main__':
    # Read the input document from the "doc" folder beside this script.
    # A context manager guarantees the file handle is closed after reading.
    d = path.dirname(__file__)
    with open(path.join(d, 'doc//临汾2019.txt'), encoding='utf-8') as source_file:
        text = source_file.read()

    # Chinese text has to be segmented into space-separated words before the
    # word cloud can count them.
    text = chnSegment.word_segment(text)
    print(text)

    # Render the word cloud from the segmented text.
    plotWordcloud.generate_wordcloud(text)
chnSegment.py 文件内容如下,主要负责对文章进行分词:
from collections import Counter
from os import path
import jieba
jieba.load_userdict(path.join(path.dirname(__file__), 'userdict//userdict.txt')) # Load the user-defined dictionary located beside this module
def word_segment(text):
    """Segment text with jieba and return the tokens joined by spaces.

    As a side effect, every distinct token and its occurrence count is written
    to ``doc//词频统计.txt`` (resolved against the current working directory),
    one ``token,count`` pair per line.

    Args:
        text: The raw text to segment.

    Returns:
        A single string holding the tokens in their original order, separated
        by single spaces.
    """
    # cut_all=False selects jieba's precise mode (True would be full mode).
    # The result is materialised once so the same tokens feed both the
    # frequency file and the returned string, instead of segmenting twice.
    words = list(jieba.cut(text, cut_all=False))

    # Write UTF-8 explicitly: the input is read as UTF-8, and the platform's
    # default encoding may be unable to represent every token.
    with open('doc//词频统计.txt', 'w', encoding='utf-8') as fw:
        fw.writelines(
            "%s,%d\n" % (word, count) for word, count in Counter(words).items()
        )

    return ' '.join(words)
plotWordcloud.py 文件内容如下,负责生成词云:
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
from wordcloud import WordCloud, STOPWORDS
import wordcloud
def generate_wordcloud(text):
    """Render a word cloud from ``text``, save it as a PNG, and display it.

    Chinese text must already be segmented into space-separated words. The
    mask image, font, and output file are all resolved relative to this
    module's directory.

    Args:
        text: The space-separated words to draw.

    Returns:
        None. The image is written to ``Images//alice.png`` and shown in a
        matplotlib window.
    """
    d = path.dirname(__file__)

    # Open the mask inside a context manager so the image file is closed as
    # soon as its pixels have been copied into the array.
    with Image.open(path.join(d, "Images//timg.jpg")) as mask_image:
        alice_mask = np.array(mask_image)

    # Words are coloured by sampling the mask image.
    image_colors = wordcloud.ImageColorGenerator(alice_mask)
    font_path = path.join(d, "font//FZSTK.TTF")
    stopwords = set(STOPWORDS)

    wc = WordCloud(
        background_color=None,  # no background colour, used together with RGBA mode
        mode="RGBA",
        max_words=2000,  # upper bound on the number of words drawn
        mask=alice_mask,  # shape the cloud is drawn into
        stopwords=stopwords,  # words excluded from the cloud
        font_path=font_path,  # font that supports Chinese; otherwise glyphs are garbled
        color_func=image_colors,
    )

    wc.generate(text)

    # Save the rendered cloud to disk.
    wc.to_file(path.join(d, "Images//alice.png"))

    # Show the cloud with bilinear interpolation and the axes hidden.
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()
网友评论