效果
[图片:image.png]
读取文章:word.txt(内容截取片段)
VOLUME I
CHAPTER I
Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.
The event had every promise of happiness for her friend. Mr. Weston
was a man of unexceptionable character, easy fortune, suitable age,
and pleasant manners; and there was some satisfaction in considering
with what self-denying, generous friendship she had always wished
and promoted the match; but it was a black morning's work for her.
The want of Miss Taylor would be felt every hour of every day.
She recalled her past kindness--the kindness, the affection of sixteen
years--how she had taught and how she had played with her from five
years old--how she had devoted all her powers to attach and amuse
her in health--and how nursed her through the various illnesses
of childhood.……
第一步:读取文件
# Open the input text file
with open("word.txt") as f:
# Iterate over the file one line at a time
for line in f:
第二步:分析文件,把单词提炼出来
# Split the line on every character that is not an ASCII letter
words=[]
for word in re.split(r'[^a-zA-Z]', line):
# Skip the empty strings the split leaves between adjacent separators
if (word.strip() != ''):
# Normalize to lowercase so counting is case-insensitive
word = word.lower()
words.append(word)
第三步:利用字典统计词频
# Dictionary mapping each word to its occurrence count
dict_word ={}
# Increment the word's count, starting from 0 the first time it is seen
dict_word[word] = dict_word.get(word, 0) + 1
统计词频结果
{'volume': 3, 'i': 3192, 'chapter': 56, 'emma': 865,
'woodhouse': 314, 'handsome': 38, 'clever': 27, 'and': 4897,
……
第四步:获取单词频率前10名的单词
# Return the `num` most frequent entries of a word-count dictionary.
def get_topN(hist, num):
    """Return the ``num`` highest-count entries of ``hist``.

    Args:
        hist: Mapping of word -> occurrence count.
        num: Maximum number of entries to return; values <= 0 yield ``[]``.

    Returns:
        A list of ``(count, word)`` tuples in descending order. Entries with
        equal counts are ordered by word, also descending, because the
        tuples are compared element by element.
    """
    if num <= 0:
        return []
    # Put the count first so plain tuple ordering sorts by frequency.
    pairs = [(value, key) for key, value in hist.items()]
    pairs.sort(reverse=True)
    # Slice with `num` (not `num + 1`) so exactly the top `num` are returned.
    return pairs[:num]
# Alternative: sort the (word, count) items by count, descending, and keep
# the first `num` of them (slicing with `num + 1` would return one too many)
sorted(hist.items(), key=lambda x: x[1], reverse=True)[:num]
#排序结果
#[(5242, 'to'), (5205, 'the'), (4897, 'and'),
(4295, 'of'), (3192, 'i'), (3130, 'a')]……
第五步:画图
# Plotting API
from matplotlib import pyplot as plt
# Font handling, used so the Chinese labels below can be rendered
from matplotlib.font_manager import FontProperties

# Font loaded from a fixed Windows font path, 15 pt
font_set = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=15)

# Entries returned by get_topN for num=10
data = get_topN(hist, 10)
for entry in data:
    # The last tuple element is the x position, the first is the bar height
    x_pos, bar_height = entry[-1], entry[0]
    plt.bar((x_pos,), (bar_height,))

# Label the axes, set the title and display the figure
plt.xlabel(u'单词', fontproperties=font_set)
plt.ylabel(u'出现次数', fontproperties=font_set)
plt.title('单词直方图', fontproperties=font_set)
plt.show()
代码:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time : 2018/1/22 15:23
# @Author : hyang
# @File : demo.py
# @Software: 获取单词直方图
import re
import string
# 导入画图
from matplotlib import pyplot as plt
# 导入字体
from matplotlib.font_manager import FontProperties
# Font object built from the simsun.ttc file at a fixed path, size 15.
# NOTE(review): the path is Windows-specific — presumably this fails to
# render on other platforms; confirm before running elsewhere.
font_set = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=15)
# Read a text file and count how often each word occurs.
def deal_file(file_name, encoding=None):
    """Count case-insensitive word frequencies in a text file.

    A word is a maximal run of ASCII letters (``a-z`` / ``A-Z``); every
    other character acts as a separator. Words are lowercased before
    counting.

    Args:
        file_name: Path of the text file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A dict mapping each lowercase word to its number of occurrences, in
        first-seen order. If the file cannot be opened or decoded, the error
        is printed and the counts gathered so far (possibly none) are
        returned.
    """
    # Compile once; matching letter runs directly avoids the empty strings
    # that splitting on non-letters would produce.
    letters = re.compile(r'[a-zA-Z]+')
    dict_word = {}
    try:
        with open(file_name, encoding=encoding) as f:
            for line in f:
                for word in letters.findall(line):
                    word = word.lower()
                    dict_word[word] = dict_word.get(word, 0) + 1
    except (OSError, ValueError) as e:
        # Best effort: report I/O and decoding problems instead of raising
        # (UnicodeDecodeError is a ValueError subclass).
        print(e)
    return dict_word
# Return the `num` most frequent words.
def get_topN(hist, num):
    """Return the ``num`` highest-count entries of ``hist``.

    Args:
        hist: Mapping of word -> occurrence count.
        num: Maximum number of entries to return; values <= 0 yield ``[]``.

    Returns:
        A list of ``(word, count)`` tuples sorted by count, highest first.
        Entries with equal counts keep the iteration order of ``hist``.
    """
    ranked = sorted(hist.items(), key=lambda item: item[1], reverse=True)
    # Slice with `num` (not `num + 1`) so exactly the top `num` are returned;
    # clamping keeps a negative `num` from slicing from the end.
    return ranked[:max(num, 0)]
def run():
    """Count the words in 'emma.txt', print the results and plot a bar chart.

    Prints the full word-count dictionary and the entries returned by
    ``get_topN(hist, 10)``, then draws one bar per entry and shows the figure.
    """
    hist = deal_file('emma.txt')
    print(hist)

    # Entries returned by get_topN for num=10
    data = get_topN(hist, 10)
    print(data)

    for entry in data:
        # First tuple element is the x position, last is the bar height
        x_pos, bar_height = entry[0], entry[-1]
        plt.bar((x_pos,), (bar_height,))

    # Label the axes, set the title and display the figure
    plt.xlabel(u'单词', fontproperties=font_set)
    plt.ylabel(u'出现次数', fontproperties=font_set)
    plt.title('单词直方图', fontproperties=font_set)
    plt.show()
# Script entry point: call run() only when this file is executed directly
if __name__ == '__main__':
    run()
网友评论