import jieba
from matplotlib import pyplot as plt
# Configure matplotlib so that Chinese text is displayed correctly.
plt.rcParams.update({
    'font.sans-serif': ['KaiTi'],  # default sans-serif font
    'axes.unicode_minus': False,  # keep the minus sign '-' from showing as a box in saved figures
})
# 读取小说
class SanGuo():
    """Word-frequency analysis of the novel text, charted as a top-n pie.

    The pipeline (see ``run``) is: read the text, count every token longer
    than one character, merge alias tokens into a canonical name, remove
    stop words, sort by frequency and draw a pie chart of the first ``n``.
    """

    # (canonical name, alias tokens) pairs; alias counts are added to the
    # canonical name before ranking.
    _ALIASES = (
        ('孔明', ('孔明曰',)),
        ('玄德', ('玄德曰', '刘备')),
        ('关公', ('云长',)),
    )

    def __init__(self, n=10):
        """Create an analyser that reports the first ``n`` ranked words."""
        self.n = n
        # (word, count) pairs sorted by descending count; set by sort_and_filter().
        self.new_word_list = []
        # word -> number of occurrences; filled by parse().
        self.counts = {}
        # Tokens removed from the ranking (this also removes the alias
        # tokens once they have been merged into their canonical name).
        self.stop_words = {"将军", "却说", "丞相", "二人", "不可", "荆州", "不能", "如此", "商议",
                           "如何", "主公", "军士", "军马", "左右", "次日", "引兵", "大喜", "天下",
                           "东吴", "于是", "今日", "不敢", "魏兵", "陛下", "都督", "人马", "不知",
                           "孔明曰", "玄德曰", "刘备", '云长'}

    def read_data(self):
        """Return the full text of ``./novel/threekingdom.txt`` (UTF-8)."""
        with open('./novel/threekingdom.txt', mode='r', encoding='utf-8') as f:
            return f.read()

    def parse(self, data):
        """Segment ``data`` with jieba and accumulate counts in ``self.counts``.

        Tokens of length 0 or 1 are ignored.
        """
        for word in jieba.lcut(data):
            if len(word) > 1:
                self.counts[word] = self.counts.get(word, 0) + 1

    def sort_and_filter(self):
        """Merge aliases, drop stop words and rank the remaining words.

        The result is stored in ``self.new_word_list`` as ``(word, count)``
        pairs in descending count order. Tokens that never occurred are
        treated as having a count of 0 instead of raising ``KeyError``, so
        the method works on any text and may be called more than once.
        """
        for canonical, aliases in self._ALIASES:
            merged = self.counts.get(canonical, 0) + sum(
                self.counts.get(alias, 0) for alias in aliases
            )
            # Do not invent a zero-count entry for a name that never appears.
            if merged:
                self.counts[canonical] = merged
        for word in self.stop_words:
            # pop() with a default: a stop word may be absent from the text.
            self.counts.pop(word, None)
        self.new_word_list = sorted(
            self.counts.items(), key=lambda item: item[1], reverse=True
        )

    def show(self):
        """Print the first ``n`` ranked words and display them as a pie chart.

        If fewer than ``n`` words are available, only those are shown.
        """
        # max() keeps a negative n equivalent to "no entries", as range() did.
        top = self.new_word_list[:max(self.n, 0)]
        print("前top{}的分析结果".format(self.n))
        for name, num in top:
            print(name, num)
        role_list = [name for name, _ in top]
        num_list = [num for _, num in top]
        plt.pie(num_list, labels=role_list, shadow=True, autopct='%1.1f%%')
        plt.axis('equal')
        plt.title('三国TOP{}人物出场频次占比图'.format(self.n), fontsize=30)
        plt.show()

    def run(self):
        """Execute the whole pipeline: read, parse, rank and show."""
        data = self.read_data()
        self.parse(data)
        self.sort_and_filter()
        self.show()
if __name__ == '__main__':
    # Script entry point: run the full analysis for the top 5 entries.
    SanGuo(5).run()
# 网友评论 (reader comments heading left over from the source web page; not part of the program)