import jieba
from wordcloud import WordCloud
import imageio
# Customize the word-cloud shape with an image mask
mask = imageio.imread("china.jpg")
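# Note: the wordcloud library treats pure-white (#FFFFFF) pixels of the mask as
# "masked out", so words are only drawn on the non-white regions of china.jpg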
# Open the text file and segment it into words with jieba
with open('hongloumeng.txt', 'r', encoding='UTF-8') as f:
    data = f.read()
word_list = jieba.lcut(data)
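# For reference, jieba.lcut returns a plain list of tokens in its default
# (accurate) mode, e.g. jieba.lcut("我来到北京清华大学") -> ['我', '来到', '北京', '清华大学']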
# Build a set of irrelevant words (stopwords) for filtering; a set is used
# because it de-duplicates entries and supports fast membership tests
excludes = {"什么", "一个", "我们", "你们", "如今", "说道", "知道", "姑娘", "起来", "这里",
            "出来", "众人", "那里", "自己", "太太", "一面", "只见", "两个", "怎么", "不是",
            "不知", "这个", "听见", "这样", "进来", "咱们", "就是", "东西", "告诉", "回来",
            "只是", "大家", "只得", "丫头", "这些", "他们", "不敢", "出去", "所以", "不过",
            "不好", "姐姐", "的话", "一时", "鸳鸯", "过来", "不能", "心里", "她们", "如此",
            "银子", "今日", "二人", "答应", "几个", "这么", "还有", "只管", "说话", "那边",
            "奶奶", "老太太", "贾政", "凤姐儿", "没有", "贾宝玉", "宝二爷", "老祖宗"}  # list cut off at 333 occurrences
# "",
# Build a dictionary to accumulate word counts
counts = {}
# Skip tokens of length <= 1 (mostly single characters and function words);
# everything longer is a candidate name and gets counted
for words in word_list:
    if len(words) <= 1:
        continue
    counts[words] = counts.get(words, 0) + 1
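# The counting loop above could equally be written with the standard library's
# collections.Counter (an equivalent alternative, not what this script uses):
# from collections import Counter
# counts = dict(Counter(w for w in word_list if len(w) > 1))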
# Merge synonyms: different titles that refer to the same character
counts['贾母'] = counts['老太太'] + counts['老祖宗'] + counts['贾母']
counts['老爷'] += counts['贾政']
counts['凤姐'] += counts['凤姐儿']
counts['宝玉'] = counts['贾宝玉'] + counts['宝玉'] + counts['宝二爷']
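# The same merges could be driven by an alias table, which scales better if more
# characters are added (a sketch; .get also guards against missing keys):
# aliases = {'贾母': ('老太太', '老祖宗'), '老爷': ('贾政',),
#            '凤姐': ('凤姐儿',), '宝玉': ('贾宝玉', '宝二爷')}
# for canonical, others in aliases.items():
#     counts[canonical] = counts.get(canonical, 0) + sum(counts.get(o, 0) for o in others)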
# Remove the irrelevant words listed in excludes
for word in excludes:
    counts.pop(word, None)  # pop with a default avoids a KeyError if a word never occurred
# Convert the dict into a list of (word, count) tuples
sort_list = list(counts.items())
# print(sort_list)
# Sort the list by count in descending order
sort_list = sorted(sort_list, key=lambda x: x[1], reverse=True)
print(sort_list)
# During tuning, print the top 20 first to spot synonyms to merge and stopwords to exclude
# for i in range(20):
#     role, count = sort_list[i]
#     print(role, count)
# Take the 10 most frequent names and repeat each one `count` times in list1,
# so the word cloud can derive sizes from plain text
list1 = []
for i in range(10):
    role, count = sort_list[i]
    print(role, count)
    for _ in range(count):
        list1.append(role)
print(list1)
# Join list1 into a single space-separated string for WordCloud
text = ' '.join(list1)
# This print is for checking whether a given name exists in counts, to help with synonym merging
# print(counts['人名'])
# Generate the word cloud and save it as an image
WordCloud(
    background_color="white",
    font_path='msyh.ttc',   # a Chinese-capable font is required to render the names
    mask=mask,
    collocations=False      # disable bigram collocations so names are not duplicated in pairs
).generate(text).to_file('红楼梦人物分析.png')
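# As an alternative, WordCloud can consume a frequency dict directly via its
# generate_from_frequencies method, avoiding the repeated-list construction above
# (a sketch; `top10` is derived here and is not a variable in the original script):
# top10 = dict(sort_list[:10])
# WordCloud(background_color="white", font_path='msyh.ttc', mask=mask) \
#     .generate_from_frequencies(top10).to_file('红楼梦人物分析.png')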
![](https://img.haomeiwen.com/i19864756/05eb86bc89e9dfe0.png)
The generated word cloud