练习一:三国人物分析top10并绘制成条形图、饼状图
import jieba
from matplotlib import pyplot as plt

# Use a CJK-capable font so Chinese labels render correctly.
plt.rcParams["font.sans-serif"] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
import numpy

# Read the full text of the novel.
with open('novel/threekingdom.txt', 'r', encoding='UTF-8') as f:
    data = f.read()

# Segment the Chinese text into a list of words.
words_list = jieba.lcut(data)

# Frequent words that are NOT character names.  A set gives O(1)
# membership tests and removes duplicates.  The alias spellings merged
# below ('孔明曰', '玄德曰', '刘备', '云长') are included so they are
# dropped after merging.
excludes = {"将军", "却说", "二人", "不可", "荆州", "不能", "如此", "丞相",
            "商议", "如何", "主公", "军士", "军马", "左右", "次日", "引兵",
            "大喜", "天下", "东吴", "于是", "今日", "不敢", "魏兵", "陛下",
            "都督", "人马", "不知", "孔明曰", "玄德曰", "刘备", "云长"}

# word -> occurrence count, e.g. {"夏侯渊": 34, ...}
counts = {}
print(type(counts))  # <class 'dict'>

# Count every word of two or more characters; single characters are
# almost never usable as names.
for word in words_list:
    if len(word) <= 1:
        continue
    counts[word] = counts.get(word, 0) + 1

# Merge aliases that refer to the same person.  .get(..., 0) keeps the
# merge safe even if an alias never appeared in the text (the original
# plain indexing would raise KeyError in that case).
counts['孔明'] = counts.get('孔明', 0) + counts.get('孔明曰', 0)
counts['玄德'] = counts.get('玄德', 0) + counts.get('玄德曰', 0) + counts.get('刘备', 0)
counts['关公'] = counts.get('关公', 0) + counts.get('云长', 0)

# Remove the non-name words; pop(..., None) tolerates missing keys,
# unlike `del counts[word]`.
for word in excludes:
    counts.pop(word, None)

# Turn the dict into a list of (word, count) pairs and sort by count,
# highest first.
items = list(counts.items())
print(items)
items.sort(key=lambda x: x[1], reverse=True)
print(items)

# Top-10 names (x) and their frequencies (y), unpacked from the pairs.
x = []
y = []
for role, count in items[:10]:
    x.append(role)
    y.append(count)

# Bar chart of the top-10 characters.
plt.bar(x, y)
plt.xlabel('人物')
plt.ylabel('频次')
plt.title('三国人物出现次数top10')
plt.grid()
plt.show()

# Pie chart; the first (largest) slice is pulled out slightly.
explode = [0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
plt.pie(x=y,
        labels=x,
        autopct='%1.2f%%',
        startangle=90,
        explode=explode,
        shadow=True)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.legend(loc=2)
plt.title('三国人物分析top10')
plt.show()
(输出图片:三国人物出现次数top10.png)
(输出图片:三国人物分析top10.png)
练习二:豆瓣即将上映电影想看人数top5并绘制条形图、水平条形图
from xpinyin import Pinyin
import requests
from lxml import html
from matplotlib import pyplot as plt
import numpy
import pandas as pd
# pip install xpinyin
def spider(city):
    """Scrape Douban's "coming soon" page for *city*, print the movie
    info sorted by anticipation, and plot the top-5 most-anticipated
    movies as a vertical and a horizontal bar chart.

    :param city: Chinese city name, e.g. '北京'; converted to pinyin
                 for the Douban URL.
    """
    def first_or_default(nodes, default='没有查询到数据'):
        # xpath() always returns a list; take the first hit, or a
        # placeholder so a missing field never crashes the loop.
        return default if len(nodes) == 0 else nodes[0]

    # Douban addresses the city by its pinyin; splitter='' joins the
    # syllables with no separator (the default is '-').
    city_pinyin = Pinyin().get_pinyin(city, splitter='')
    url = 'https://movie.douban.com/cinema/later/{}/'.format(city_pinyin)
    print('您要爬取的目标站点是', url)
    print('爬虫进行中,请稍后...')
    # Impersonate a regular browser so the request is not rejected.
    # (Fixed the 'Mozillsa' typo in the original User-Agent string.)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    # Fetch the page source.
    response = requests.get(url, headers=headers)
    html_data = response.text
    # Each upcoming movie lives in one child div of #showing-soon.
    selector = html.fromstring(html_data)
    div_list = selector.xpath('//div[@id="showing-soon"]/div')
    print('您好,{}市共查询到{}部即将上映的电影'.format(city, len(div_list)))
    movie_info_list = []
    for div in div_list:
        movie_name = first_or_default(div.xpath('div[1]/h3/a/text()'))
        # Release date.
        date = first_or_default(div.xpath('div[1]/ul/li[1]/text()'))
        # Genre ('movie_type' avoids shadowing the builtin `type`;
        # the dict key below stays "type").
        movie_type = first_or_default(div.xpath('div[1]/ul/li[2]/text()'))
        # Country of origin.
        country = first_or_default(div.xpath('div[1]/ul/li[3]/text()'))
        # Anticipation count, e.g. '1234人想看' -> 1234.  Fall back to 0
        # so int() cannot crash on the placeholder string.
        want_see = first_or_default(div.xpath('div[1]/ul/li[4]/span/text()'))
        try:
            want_see = int(want_see.replace('人想看', ''))
        except ValueError:
            want_see = 0
        # Poster image URL.
        img_link = first_or_default(div.xpath('a/img/@src'))
        # Collect one record per movie: [{...}, {...}, ...]
        movie_info_list.append({
            "movie_name": movie_name,
            "date": date,
            "type": movie_type,
            "country": country,
            "want_see": want_see,
            "img_link": img_link
        })
    # Most-anticipated movies first.
    movie_info_list.sort(key=lambda m: m['want_see'], reverse=True)
    print(movie_info_list)
    # CJK-capable font so the Chinese labels render.
    plt.rcParams["font.sans-serif"] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    # Names and counts of the top five.
    x = [movie['movie_name'] for movie in movie_info_list[:5]]
    y = [movie['want_see'] for movie in movie_info_list[:5]]
    # Vertical bar chart.
    plt.bar(x, y)
    plt.xlabel('电影名称')
    plt.ylabel('想看人数')
    plt.title('豆瓣即将上映电影想看人数TOP5')
    plt.grid()
    plt.show()
    # Horizontal bar chart of the same data.
    plt.barh(x, y)
    plt.xlabel('想看人数')
    plt.ylabel('电影名称')
    plt.title('豆瓣即将上映电影想看人数TOP5')
    plt.show()
# Prompt for a city name on stdin and run the scraper on it.
spider(input('请输入您要查看即将上映电影信息的城市:'))
(输出图片:豆瓣即将上映电影想看人数top5.png —— 条形图)
(输出图片:豆瓣即将上映电影想看人数top5.png —— 水平条形图,原文两行文件名重复)
练习三:绘制电影国家占比图
from xpinyin import Pinyin
import requests
from lxml import html
from matplotlib import pyplot as plt
import numpy
import pandas as pd
# pip install xpinyin
def spider(city):
    """Scrape Douban's "coming soon" page for *city* and draw a pie
    chart showing how many upcoming movies each country contributes.

    :param city: Chinese city name, e.g. '北京'; converted to pinyin
                 for the Douban URL.
    """
    def first_or_default(nodes, default='没有查询到数据'):
        # xpath() always returns a list; take the first hit, or a
        # placeholder so a missing field never crashes the loop.
        return default if len(nodes) == 0 else nodes[0]

    # Douban addresses the city by its pinyin; splitter='' joins the
    # syllables with no separator (the default is '-').
    city_pinyin = Pinyin().get_pinyin(city, splitter='')
    url = 'https://movie.douban.com/cinema/later/{}/'.format(city_pinyin)
    print('您要爬取的目标站点是', url)
    print('爬虫进行中,请稍后...')
    # Impersonate a regular browser so the request is not rejected.
    # (Fixed the 'Mozillsa' typo in the original User-Agent string.)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    # Fetch the page source.
    response = requests.get(url, headers=headers)
    html_data = response.text
    # Each upcoming movie lives in one child div of #showing-soon.
    selector = html.fromstring(html_data)
    div_list = selector.xpath('//div[@id="showing-soon"]/div')
    print('您好,{}市共查询到{}部即将上映的电影'.format(city, len(div_list)))
    movie_info_list = []
    for div in div_list:
        movie_name = first_or_default(div.xpath('div[1]/h3/a/text()'))
        # Release date.
        date = first_or_default(div.xpath('div[1]/ul/li[1]/text()'))
        # Genre ('movie_type' avoids shadowing the builtin `type`;
        # the dict key below stays "type").
        movie_type = first_or_default(div.xpath('div[1]/ul/li[2]/text()'))
        # Country of origin.
        country = first_or_default(div.xpath('div[1]/ul/li[3]/text()'))
        # Anticipation count, e.g. '1234人想看' -> 1234.  Fall back to 0
        # so int() cannot crash on the placeholder string.
        want_see = first_or_default(div.xpath('div[1]/ul/li[4]/span/text()'))
        try:
            want_see = int(want_see.replace('人想看', ''))
        except ValueError:
            want_see = 0
        # Poster image URL.
        img_link = first_or_default(div.xpath('a/img/@src'))
        # Collect one record per movie: [{...}, {...}, ...]
        movie_info_list.append({
            "movie_name": movie_name,
            "date": date,
            "type": movie_type,
            "country": country,
            "want_see": want_see,
            "img_link": img_link
        })
    # Tally movies per country: {country: movie_count}.
    counts = {}
    for movie in movie_info_list:
        counts[movie['country']] = counts.get(movie['country'], 0) + 1
    print(counts)
    x = list(counts.keys())
    y = list(counts.values())
    print(x)
    print(y)
    # Pie chart of the country share; CJK font so labels render.
    plt.rcParams["font.sans-serif"] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.pie(x=y,
            labels=x,
            autopct='%1.1f%%')
    plt.title('电影国家占比图')
    plt.legend(loc=2)
    plt.show()
# Prompt for a city name on stdin and run the scraper on it.
spider(input('请输入您要查看即将上映电影信息的城市:'))
(输出图片:电影国家占比图.png)
网友评论