美文网首页
绘图练习

绘图练习

作者: code与有荣焉 | 来源:发表于2019-10-30 11:20 被阅读0次

    练习一:三国人物分析top10并绘制成条形图、饼状图

    import jieba
    from matplotlib import pyplot as plt
    plt.rcParams["font.sans-serif"] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    import numpy
    # Read the whole novel; the file only has to stay open for the read itself.
    with open('novel/threekingdom.txt', 'r', encoding='UTF-8') as f:
        data = f.read()

    # Segment the text into a list of words.
    words_list = jieba.lcut(data)

    # Frequent words that should not appear in the ranking, plus the aliases
    # ("孔明曰", "玄德曰", "刘备", "云长") that are folded into another key below.
    # A set is used so that duplicates collapse automatically.
    excludes = {"将军", "却说", "二人", "不可", "荆州", "不能", "如此", "丞相",
                "商议", "如何", "主公", "军士", "军马", "左右", "次日", "引兵",
                "大喜", "天下", "东吴", "于是", "今日", "不敢", "魏兵", "陛下",
                "都督", "人马", "不知", "孔明曰", "玄德曰", "刘备", "云长"}

    # Maps word -> number of occurrences, e.g. {"夏侯渊": 34, "害怕": 33, ...}
    counts = {}
    print(type(counts))  # <class 'dict'>
    for word in words_list:
        # Single-character tokens are skipped.
        if len(word) <= 1:
            continue
        counts[word] = counts.get(word, 0) + 1

    # Merge the different spellings that refer to the same person. `.get(..., 0)`
    # keeps this working when a spelling never occurs in the segmented text;
    # plain indexing would raise KeyError in that case.
    counts['孔明'] = counts.get('孔明', 0) + counts.get('孔明曰', 0)
    counts['玄德'] = (counts.get('玄德', 0) + counts.get('玄德曰', 0)
                    + counts.get('刘备', 0))
    counts['关公'] = counts.get('关公', 0) + counts.get('云长', 0)

    # Drop the excluded words. `pop` with a default tolerates words that were
    # never counted, whereas `del counts[word]` would raise KeyError.
    for word in excludes:
        counts.pop(word, None)

    # Turn the dict into a list of (word, count) tuples and sort it by count,
    # highest first.
    items = list(counts.items())
    print(items)
    items.sort(key=lambda item: item[1], reverse=True)
    print(items)

    # Keep at most the ten most frequent words; slicing cannot run past the end
    # of the list the way indexing items[0..9] could.
    top_items = items[:10]
    x = [role for role, _ in top_items]
    y = [count for _, count in top_items]

    # Bar chart.
    plt.bar(x, y)
    plt.xlabel('人物')
    plt.ylabel('频次')
    plt.title('三国人物出现次数top10')
    plt.grid()
    plt.show()

    # Pie chart. Only the first (largest) wedge is pulled out; the list is sized
    # from the data because `explode` must have one entry per wedge.
    explode = [0.1 if index == 0 else 0 for index in range(len(y))]
    plt.pie(x=y,
            labels=x,
            autopct='%1.2f%%',
            startangle=90,
            explode=explode,
            shadow=True
            )
    plt.axis('equal')
    plt.legend(loc=2)
    plt.title('三国人物分析top10')
    plt.show()
    
    三国人物出现次数top10.png
    三国人物分析top10.png

    练习二:豆瓣即将上映电影想看人数top5并绘制条形图、水平条形图

    from xpinyin import Pinyin
    import requests
    from lxml import html
    from matplotlib import pyplot as plt
    import numpy
    import pandas as pd
    # pip install xpinyin
    def spider(city):
        """Scrape Douban's "coming soon" page for a city and plot the top five movies.

        The city name is converted to pinyin (no separator) to build the page
        URL. Every entry under the ``showing-soon`` container is parsed into a
        dict, the list is sorted by "want to see" count in descending order and
        printed, and the five highest entries are shown first as a vertical and
        then as a horizontal bar chart.

        Args:
            city: City name as typed by the user (passed to ``Pinyin().get_pinyin``).

        Returns:
            None. Results are printed and plotted.

        Note:
            Network errors from ``requests.get`` are not caught here and
            propagate to the caller.
        """
        # Placeholder stored when a field cannot be found on the page.
        missing = '没有查询到数据'

        def first_or_missing(node, path):
            """Return the first XPath match below *node*, or the placeholder."""
            matches = node.xpath(path)
            return matches[0] if matches else missing

        # `splitter` is the separator between syllables; the default is '-'.
        city_pinyin = Pinyin().get_pinyin(city, splitter='')
        url = 'https://movie.douban.com/cinema/later/{}/'.format(city_pinyin)
        print('您要爬取的目标站点是', url)
        print('爬虫进行中,请稍后...')
        # Request headers that make the scraper look like a regular browser.
        # The product token was misspelled "Mozillsa", which defeats that purpose.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
        # Fetch the page source; the timeout keeps a stalled connection from
        # hanging the script forever.
        response = requests.get(url, headers=headers, timeout=10)
        html_data = response.text
        # Extract the per-movie containers (xpath returns a list).
        selector = html.fromstring(html_data)
        div_list = selector.xpath('//div[@id="showing-soon"]/div')
        print('您好,{}市共查询到{}部即将上映的电影'.format(city,len(div_list)))

        movie_info_list = []
        for div in div_list:
            movie_name = first_or_missing(div, 'div[1]/h3/a/text()')
            # Release date
            date = first_or_missing(div, 'div[1]/ul/li[1]/text()')
            # Genre (named movie_type so the builtin `type` is not shadowed)
            movie_type = first_or_missing(div, 'div[1]/ul/li[2]/text()')
            # Country
            country = first_or_missing(div, 'div[1]/ul/li[3]/text()')
            # "Want to see" count. When the field is missing or not numeric the
            # movie is counted as 0 instead of crashing on int(placeholder).
            want_see_text = first_or_missing(div, 'div[1]/ul/li[4]/span/text()')
            try:
                want_see = int(want_see_text.replace('人想看', ''))
            except ValueError:
                want_see = 0
            # Poster image link
            img_link = first_or_missing(div, 'a/img/@src')
            # Collect everything as a list of dicts: [{}, {}, {}]
            movie_info_list.append({
                "movie_name": movie_name,
                "date": date,
                "type": movie_type,
                "country": country,
                "want_see": want_see,
                "img_link": img_link
            })

        # Sort by "want to see" count, highest first.
        movie_info_list.sort(key=lambda movie: movie['want_see'], reverse=True)
        print(movie_info_list)

        # Font settings so that Chinese labels and minus signs render correctly.
        plt.rcParams["font.sans-serif"] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False

        # Slicing yields at most five entries even when fewer movies were found.
        top_movies = movie_info_list[:5]
        x = [movie['movie_name'] for movie in top_movies]
        y = [movie['want_see'] for movie in top_movies]

        # Vertical bar chart
        plt.bar(x, y)
        plt.xlabel('电影名称')
        plt.ylabel('想看人数')
        plt.title('豆瓣即将上映电影想看人数TOP5')
        plt.grid()
        plt.show()
        # Horizontal bar chart
        plt.barh(x, y)
        plt.xlabel('想看人数')
        plt.ylabel('电影名称')
        plt.title('豆瓣即将上映电影想看人数TOP5')
        plt.show()
    
    
    
    
    
    
    # Prompt on the console for the city whose upcoming movies should be looked up.
    city = input('请输入您要查看即将上映电影信息的城市:')
    # Run the scraper for that city.
    spider(city)
    
    
    豆瓣即将上映电影想看人数top5.png 豆瓣即将上映电影想看人数top5.png

    练习三:绘制电影国家占比图

    from xpinyin import Pinyin
    import requests
    from lxml import html
    from matplotlib import pyplot as plt
    import numpy
    import pandas as pd
    # pip install xpinyin
    def spider(city):
        """Scrape Douban's "coming soon" page for a city and plot movies per country.

        The city name is converted to pinyin (no separator) to build the page
        URL. Every entry under the ``showing-soon`` container is parsed into a
        dict, the movies are tallied by their country field, and the tally is
        printed and drawn as a pie chart.

        Args:
            city: City name as typed by the user (passed to ``Pinyin().get_pinyin``).

        Returns:
            None. Results are printed and plotted.

        Note:
            Network errors from ``requests.get`` are not caught here and
            propagate to the caller.
        """
        # Placeholder stored when a field cannot be found on the page.
        missing = '没有查询到数据'

        def first_or_missing(node, path):
            """Return the first XPath match below *node*, or the placeholder."""
            matches = node.xpath(path)
            return matches[0] if matches else missing

        # `splitter` is the separator between syllables; the default is '-'.
        city_pinyin = Pinyin().get_pinyin(city, splitter='')
        url = 'https://movie.douban.com/cinema/later/{}/'.format(city_pinyin)
        print('您要爬取的目标站点是', url)
        print('爬虫进行中,请稍后...')
        # Request headers that make the scraper look like a regular browser.
        # The product token was misspelled "Mozillsa", which defeats that purpose.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
        # Fetch the page source; the timeout keeps a stalled connection from
        # hanging the script forever.
        response = requests.get(url, headers=headers, timeout=10)
        html_data = response.text
        # Extract the per-movie containers (xpath returns a list).
        selector = html.fromstring(html_data)
        div_list = selector.xpath('//div[@id="showing-soon"]/div')
        print('您好,{}市共查询到{}部即将上映的电影'.format(city,len(div_list)))

        movie_info_list = []
        for div in div_list:
            movie_name = first_or_missing(div, 'div[1]/h3/a/text()')
            # Release date
            date = first_or_missing(div, 'div[1]/ul/li[1]/text()')
            # Genre (named movie_type so the builtin `type` is not shadowed)
            movie_type = first_or_missing(div, 'div[1]/ul/li[2]/text()')
            # Country
            country = first_or_missing(div, 'div[1]/ul/li[3]/text()')
            # "Want to see" count. When the field is missing or not numeric the
            # movie is counted as 0 instead of crashing on int(placeholder).
            want_see_text = first_or_missing(div, 'div[1]/ul/li[4]/span/text()')
            try:
                want_see = int(want_see_text.replace('人想看', ''))
            except ValueError:
                want_see = 0
            # Poster image link
            img_link = first_or_missing(div, 'a/img/@src')
            # Collect everything as a list of dicts: [{}, {}, {}]
            movie_info_list.append({
                "movie_name": movie_name,
                "date": date,
                "type": movie_type,
                "country": country,
                "want_see": want_see,
                "img_link": img_link
            })

        # Tally the number of movies per country.
        counts = {}
        for movie in movie_info_list:
            country_name = movie['country']
            counts[country_name] = counts.get(country_name, 0) + 1
        print(counts)
        x = list(counts.keys())
        y = list(counts.values())
        print(x)
        print(y)

        # Font settings so that Chinese labels and minus signs render correctly.
        plt.rcParams["font.sans-serif"] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        # Pie chart of the share of movies per country.
        plt.pie(x=y,
                labels=x,
                autopct='%1.1f%%')
        plt.title('电影国家占比图')
        plt.legend(loc=2)
        plt.show()
    
    
    # Prompt on the console for the city whose upcoming movies should be looked up.
    city = input('请输入您要查看即将上映电影信息的城市:')
    # Run the scraper for that city.
    spider(city)
    
    电影国家占比图.png

    相关文章

      网友评论

          本文标题:绘图练习

          本文链接:https://www.haomeiwen.com/subject/qsonvctx.html