美文网首页
绘图练习

绘图练习

作者: code与有荣焉 | 来源:发表于2019-10-30 11:20 被阅读0次

练习一:三国人物分析top10并绘制成条形图、饼状图

import jieba
from matplotlib import pyplot as plt
import numpy

# Select the SimHei font so the Chinese labels/titles below can be rendered,
# and keep the minus sign drawable with that font.
plt.rcParams["font.sans-serif"] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Exercise 1: count the most frequent names in the novel and plot the top 10
# as a bar chart and as a pie chart.

# Read the whole novel; the file is closed as soon as the text is in memory
# instead of being held open during segmentation and plotting.
with open('novel/threekingdom.txt', 'r', encoding='UTF-8') as f:
    data = f.read()

# Segment the text into a list of words.
words_list = jieba.lcut(data)

# Words that must not appear in the ranking. A set is used so duplicates are
# impossible. Aliases that get merged into another name below (e.g. "孔明曰")
# are listed here as well so they are dropped after merging.
excludes = {"将军", "却说", "二人", "不可", "荆州", "不能", "如此", "丞相",
            "商议", "如何", "主公", "军士", "军马", "左右", "次日", "引兵",
            "大喜", "天下", "东吴", "于是", "今日", "不敢", "魏兵", "陛下",
            "都督", "人马", "不知", "孔明曰", "玄德曰", "刘备", "云长"}

# word -> number of occurrences, e.g. {"夏侯渊": 34, "害怕": 33, ...}
counts = {}
print(type(counts))  # <class 'dict'>
for word in words_list:
    # Single-character tokens are skipped entirely.
    if len(word) <= 1:
        continue
    counts[word] = counts.get(word, 0) + 1

# Merge different spellings that refer to the same person. `.get(..., 0)` is
# used so a name or alias that never occurs in the text does not raise
# KeyError.
for name, aliases in (('孔明', ('孔明曰',)),
                      ('玄德', ('玄德曰', '刘备')),
                      ('关公', ('云长',))):
    counts[name] = counts.get(name, 0) + sum(
        counts.get(alias, 0) for alias in aliases)

# Drop the irrelevant words; `pop` with a default tolerates words that are
# not present in the text.
for word in excludes:
    counts.pop(word, None)

# Turn the dict into a list of (word, count) tuples and sort it by count,
# highest first.
items = list(counts.items())
print(items)
items.sort(key=lambda x: x[1], reverse=True)
print(items)

# Keep at most the 10 most frequent words; slicing (rather than indexing
# 0..9) avoids IndexError when fewer than 10 words remain.
top_items = items[:10]
x = [role for role, _ in top_items]
y = [count for _, count in top_items]

# Bar chart
plt.bar(x, y)
plt.xlabel('人物')
plt.ylabel('频次')
plt.title('三国人物出现次数top10')
plt.grid()
plt.show()

# Pie chart: only the first (largest) wedge is pulled out. `explode` is
# built from the actual number of wedges because it must match len(y).
explode = [0.1 if i == 0 else 0 for i in range(len(y))]
plt.pie(x=y,
        labels=x,
        autopct='%1.2f%%',
        startangle=90,
        explode=explode,
        shadow=True
        )
plt.axis('equal')
plt.legend(loc=2)
plt.title('三国人物分析top10')
plt.show()
三国人物出现次数top10.png
三国人物分析top10.png

练习二:豆瓣即将上映电影想看人数top5并绘制条形图、水平条形图

from xpinyin import Pinyin
import requests
from lxml import html
from matplotlib import pyplot as plt
import numpy
import pandas as pd
# pip install xpinyin
def _first_or_default(nodes, default='没有查询到数据'):
    """Return the first XPath result in *nodes*, or *default* if it is empty."""
    return nodes[0] if nodes else default


def _parse_want_see(nodes):
    """Convert an 'N人想看' XPath result into an int; 0 if missing or malformed."""
    if not nodes:
        return 0
    try:
        return int(nodes[0].replace('人想看', ''))
    except ValueError:
        return 0


def spider(city):
    """Scrape Douban's "coming soon" page for *city* and plot the top 5.

    The city name is converted to pinyin to build the page URL. Every movie
    found under the ``showing-soon`` container is collected as a dict with
    the keys ``movie_name``, ``date``, ``type``, ``country``, ``want_see``
    and ``img_link``; the list is sorted by ``want_see`` (descending) and
    printed. The five most wanted movies are then shown as a vertical and a
    horizontal bar chart.

    Args:
        city: City name as typed by the user; passed to xpinyin.

    Returns:
        None. Results are printed and plotted.
    """
    # splitter is the separator placed between syllables (default '-');
    # an empty string joins them into one word for the URL.
    city_pinyin = Pinyin().get_pinyin(city, splitter='')
    url = 'https://movie.douban.com/cinema/later/{}/'.format(city_pinyin)
    print('您要爬取的目标站点是', url)
    print('爬虫进行中,请稍后...')
    # Request headers that make the request look like it comes from a browser.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    # Fetch the page source; the timeout prevents hanging forever when the
    # server does not answer.
    response = requests.get(url, headers=headers, timeout=10)
    html_data = response.text
    # Extract the parts we are interested in (xpath returns a list).
    selector = html.fromstring(html_data)
    div_list = selector.xpath('//div[@id="showing-soon"]/div')
    print('您好,{}市共查询到{}部即将上映的电影'.format(city, len(div_list)))

    movie_info_list = []
    for div in div_list:
        # Each text field falls back to a placeholder when the node is
        # missing, so one incomplete entry does not abort the whole run.
        movie_name = _first_or_default(div.xpath('div[1]/h3/a/text()'))
        # Release date
        date = _first_or_default(div.xpath('div[1]/ul/li[1]/text()'))
        # Genre (local is not called `type` to avoid shadowing the builtin)
        movie_type = _first_or_default(div.xpath('div[1]/ul/li[2]/text()'))
        # Country
        country = _first_or_default(div.xpath('div[1]/ul/li[3]/text()'))
        # Number of people who want to see it. A missing or malformed value
        # becomes 0 instead of feeding the placeholder text to int().
        want_see = _parse_want_see(div.xpath('div[1]/ul/li[4]/span/text()'))
        # Poster image link
        img_link = _first_or_default(div.xpath('a/img/@src'))
        # Collect the record: [{...}, {...}, ...]
        movie_info_list.append({
            "movie_name": movie_name,
            "date": date,
            "type": movie_type,
            "country": country,
            "want_see": want_see,
            "img_link": img_link
        })

    # Sort by popularity, highest first.
    movie_info_list.sort(key=lambda x: x['want_see'], reverse=True)
    print(movie_info_list)

    # Font settings so the Chinese labels can be rendered.
    plt.rcParams["font.sans-serif"] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # The (at most) five most wanted movies.
    top_five = movie_info_list[:5]
    x = [movie['movie_name'] for movie in top_five]
    y = [movie['want_see'] for movie in top_five]

    # Vertical bar chart
    plt.bar(x, y)
    plt.xlabel('电影名称')
    plt.ylabel('想看人数')
    plt.title('豆瓣即将上映电影想看人数TOP5')
    plt.grid()
    plt.show()
    # Horizontal bar chart
    plt.barh(x, y)
    plt.xlabel('想看人数')
    plt.ylabel('电影名称')
    plt.title('豆瓣即将上映电影想看人数TOP5')
    plt.show()






# Ask on the console which city's upcoming movies should be shown,
# then run the scraper for that city.
prompt = '请输入您要查看即将上映电影信息的城市:'
city = input(prompt)
spider(city)

豆瓣即将上映电影想看人数top5.png 豆瓣即将上映电影想看人数top5.png

练习三:绘制电影国家占比图

from xpinyin import Pinyin
import requests
from lxml import html
from matplotlib import pyplot as plt
import numpy
import pandas as pd
# pip install xpinyin
def _first_or_default(nodes, default='没有查询到数据'):
    """Return the first XPath result in *nodes*, or *default* if it is empty."""
    return nodes[0] if nodes else default


def _parse_want_see(nodes):
    """Convert an 'N人想看' XPath result into an int; 0 if missing or malformed."""
    if not nodes:
        return 0
    try:
        return int(nodes[0].replace('人想看', ''))
    except ValueError:
        return 0


def spider(city):
    """Scrape Douban's "coming soon" page for *city* and plot a country pie.

    The city name is converted to pinyin to build the page URL. Every movie
    found under the ``showing-soon`` container is collected as a dict with
    the keys ``movie_name``, ``date``, ``type``, ``country``, ``want_see``
    and ``img_link``. The movies are then counted per ``country`` value and
    the shares are shown as a pie chart.

    Args:
        city: City name as typed by the user; passed to xpinyin.

    Returns:
        None. Results are printed and plotted.
    """
    # splitter is the separator placed between syllables (default '-');
    # an empty string joins them into one word for the URL.
    city_pinyin = Pinyin().get_pinyin(city, splitter='')
    url = 'https://movie.douban.com/cinema/later/{}/'.format(city_pinyin)
    print('您要爬取的目标站点是', url)
    print('爬虫进行中,请稍后...')
    # Request headers that make the request look like it comes from a browser.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    # Fetch the page source; the timeout prevents hanging forever when the
    # server does not answer.
    response = requests.get(url, headers=headers, timeout=10)
    html_data = response.text
    # Extract the parts we are interested in (xpath returns a list).
    selector = html.fromstring(html_data)
    div_list = selector.xpath('//div[@id="showing-soon"]/div')
    print('您好,{}市共查询到{}部即将上映的电影'.format(city, len(div_list)))

    movie_info_list = []
    for div in div_list:
        # Each text field falls back to a placeholder when the node is
        # missing, so one incomplete entry does not abort the whole run.
        movie_name = _first_or_default(div.xpath('div[1]/h3/a/text()'))
        # Release date
        date = _first_or_default(div.xpath('div[1]/ul/li[1]/text()'))
        # Genre (local is not called `type` to avoid shadowing the builtin)
        movie_type = _first_or_default(div.xpath('div[1]/ul/li[2]/text()'))
        # Country
        country = _first_or_default(div.xpath('div[1]/ul/li[3]/text()'))
        # Number of people who want to see it. A missing or malformed value
        # becomes 0 instead of feeding the placeholder text to int().
        want_see = _parse_want_see(div.xpath('div[1]/ul/li[4]/span/text()'))
        # Poster image link
        img_link = _first_or_default(div.xpath('a/img/@src'))
        # Collect the record: [{...}, {...}, ...]
        movie_info_list.append({
            "movie_name": movie_name,
            "date": date,
            "type": movie_type,
            "country": country,
            "want_see": want_see,
            "img_link": img_link
        })

    # Count the movies per country value so labels and sizes can be pulled
    # out of one container.
    counts = {}
    for movie in movie_info_list:
        counts[movie['country']] = counts.get(movie['country'], 0) + 1
    print(counts)
    x = list(counts.keys())
    y = list(counts.values())
    print(x)
    print(y)

    # Pie chart of the share of movies per country.
    plt.rcParams["font.sans-serif"] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.pie(x=y,
            labels=x,
            autopct='%1.1f%%')
    plt.title('电影国家占比图')
    plt.legend(loc=2)
    plt.show()


# Ask on the console which city's upcoming movies should be shown,
# then run the scraper for that city.
prompt = '请输入您要查看即将上映电影信息的城市:'
city = input(prompt)
spider(city)
电影国家占比图.png

相关文章

网友评论

      本文标题:绘图练习

      本文链接:https://www.haomeiwen.com/subject/qsonvctx.html