python第七天

作者: code与有荣焉 | 来源:发表于2019-10-29 20:53 被阅读0次

    一、python中的生成表达式

    1. 列表推导式

    用途:快速生成一个列表
    格式

    # 格式 [表达式 for 临时变量 in 可迭代对象 [条件语句]]
    

    与普通for循环创建列表进行对比:

    # 使用普通for循环创建列表
    # 生成一个空列表
    li = []
    # for 临时变量 in 可迭代对象:
    #     循环体
    for i in range(10):
        # 向列表中添加元素
        li.append(i)
    print(li)
    

    用列表推导式创建列表

    # The same 0-9 list that took four lines above, built and printed
    # in a single line with a list comprehension.
    print([i for i in range(10)])
    

    示例1:

    # 生成随机数
    from random import randint
    print(randint(60, 100))
    
    # 生成一个列表['序号:998','序号:992', '序号:993' ]
    # [100,999]共十个元素
    
    # for循环
    from random import randint
    l1 = []
    # _ 是给读代码的人看的,表示下面将不使用 _
    for _ in range(10):
        seq = '序号:{}'.format(randint(100, 999))
        l1.append(seq)
    print(l1) # ['序号:948', '序号:475', '序号:727', '序号:291', '序号:788', '序号:708', '序号:554', '序号:217', '序号:407', '序号:594']
    # 列表推导式
    l2 = ['序号:{}'.format(randint(100, 999)) for _ in range(10)]
    # print(l2) # ['序号:177', '序号:568', '序号:518', '序号:878', '序号:523', '序号:526', '序号:938', '序号:556', '序号:137', '序号:432']
    
    

    示例2:

    # 生成一个列表
    from random import randint
    li = []
    for _ in range(10):
        li.append(randint(30, 100))
    print(li)
    # 已知列表li ,然后筛选所有的偶数
    li = [92, 53, 84, 54, 82, 92, 95, 38, 52, 42]
    # for 循环 遍历这个列表
    result = []
    for x in li:
        if x%2 == 0:
            result.append(x)
    print(result) # [92, 84, 54, 82, 92, 38, 52, 42]
    
    # 格式 [表达式 for 临时变量 in 可迭代对象 [条件语句]]
    [x for x in li if x%2==0]
    

    练习:使用列表推导式生成一个含有二十个元素的随机数列表[],再筛选出所有的奇数。

    from random import randint
    
    li = [randint(0, 100) for _ in range(20)]
    print([i for i in li if i%2 == 1]) # [33, 79, 45, 65, 59, 95, 57, 65, 97]
    
    

    2. 三目运算符

    用途:可以使用一行的 if else
    格式

    # 格式: 返回值 if 满足条件的表达式 else 不满足时要执行的事情
    

    示例1:

    li = ['dada']
    if len(li)==0:
        li = 'aa'
    else:
        li = li[0]
    print(li)
    # 三目运算符
    # 格式: 返回值 if 满足条件的表达式 else 不满足时要执行的事情
    s = 'aa' if len(li) == 0 else li[0]
    print(s)
    

    二、爬取当当图书信息(优化后)

    import requests
    from lxml import html
    # 安装pandas
    # pip install pandas
    # 导入pandas
    import pandas as pd
    def spider(isbn):
        """:param   #param是参数
        当当网图书信息爬虫
        """
        # url = "http://search.dangdang.com/?key=python%B4%D3%C8%EB%C3%C5%B5%BD%CA%B5%BC%F9&act=input"
        # isbn 国际标准书号(唯一的) 9787115428028
        url = "http://search.dangdang.com/?key={}&act=input".format(isbn)
        print(url)
        # 获取网页的源代码
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
        html_data = requests.get(url, headers=headers).text
        print(html_data)
    
        #使用xpath语法提取我们想要的内容
        selector = html.fromstring(html_data)
    
        ul_list = selector.xpath('//div[@id="search_nature_rg"]/ul/li')
        print('有{}家商铺售卖此书'.format(len(ul_list)))
    
        # 用于存储图书的所有信息,每一家是一个字典
        # [{},{},{}]
        book_info_list = []
        # 遍历
        for li in ul_list:
            # 爬取所有书籍的标题
            title = li.xpath('a/@title')[0]
            # print(title)
            # 获取所有购买链接
            link = li.xpath('a/@href')[0]
            # print(link)
            # 获取价格
            price = li.xpath('p[@class="price"]/span[@class="search_now_price"]/text()')[0]
            # print(price)
            # 去掉¥符号
            price = price.replace('¥', ' ')
            # print(price)
    
            # 爬取除了当当自营以外的所有店铺(作业)
            # //标签1[@属性1=属性值1]/.../text()
            # //标签1[@属性1=属性值1]/.../@属性的名字
            # store = li.xpath('p[@class="search_shangjia"]/a/text()')
            store = li.xpath('p[4]/a/@title')
            # store列表是当当自营的时候是空的
            # if len(store) == 0:
            #     # 当当自营
            #     store = "当当自营"
            # else:
            #     store = store[0]
    
            # 代码优化
            store = '当当自营' if len(store) == 0 else store[0]
            print(store)
            # print(store)
            book_info_list.append({
                'title': title,
                'link': link,
                'price': price,
                'store': store
            })
    
        # 排序
        book_info_list.sort(key=lambda x: float(x['price']), reverse=True)
        # 遍历图书列表
        for book in book_info_list:
            print(book)
        # import pandas as pd
        # 转化成dataframe格式
        df = pd.DataFrame(book_info_list)
        # 存储成csv   ,csv 是逗号分隔值文件
        df.to_csv('当当图书信息.csv')
    # Ask the user for an ISBN on stdin and start the Dangdang scraper.
    isbn = input('请输入您要查询的书号')
    spider(isbn)
    

    三、爬取豆瓣即将上映电影信息(优化后)

    与上一版相比,新增了下载电影海报图片的功能。

    from xpinyin import Pinyin
    import requests
    from lxml import html
    import pandas as pd
    # pip install xpinyin
    def spider(city):
        # splitter 是分隔使用符号,默认是‘-’
        city_pinyin = Pinyin().get_pinyin(city, splitter='')
        url = 'https://movie.douban.com/cinema/later/{}/'.format(city_pinyin)
        print('您要爬取的目标站点是', url)
        print('爬虫进行中,请稍后...')
        # 请求头信息, 目的是伪装成浏览器进行爬虫
        headers = {'User-Agent': 'Mozillsa/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
        # 获取网页的源代码
        response = requests.get(url, headers=headers)
        html_data = response.text
        # 提取我们想要的内容
        selector = html.fromstring(html_data)
        div_list = selector.xpath('//div[@id="showing-soon"]/div') # xpath爬取的是一个列表
        print('您好,{}市共查询到{}部即将上映的电影'.format(city,len(div_list)))
        movie_info_list = []
        for div in div_list:
            movie_name = div.xpath('div[1]/h3/a/text()')
            # if len(movie_name)==0:
            #     movie_name = '没有查询到数据'
            # else:
            #     movie_name = movie_name[0]
    
            # 代码优化   ,可以使程序错误也继续运行
            movie_name = '没有查询到数据' if len(movie_name) == 0 else movie_name[0]
            # print(movie_name)
            # 上映日期
            date = div.xpath('div[1]/ul/li[1]/text()')
            date = '没有查询到数据' if len(date) == 0 else date[0]
            # print(date)
            # 类型
            type = div.xpath('div[1]/ul/li[2]/text()')
            type = '没有查询到数据' if len(type) == 0 else type[0]
            # print(type)
            # 国家
            country = div.xpath('div[1]/ul/li[3]/text()')
            country = '没有查询到数据' if len(country) == 0 else country[0]
            # print(country)
            # 想看人数
            want_see = div.xpath('div[1]/ul/li[4]/span/text()')
            want_see = '没有查询到数据' if len(want_see) == 0 else want_see[0]
            want_see = int(want_see.replace('人想看', ''))
            # print(want_see)
            # 图片链接
            img_link = div.xpath('a/img/@src')
            img_link = '没有查询到数据' if len(img_link) == 0 else img_link[0]
            # 将信息放入一个列表中 [{}, {}, {}]
            movie_info_list.append({
                "movie_name": movie_name,
                "date": date,
                "type": type,
                "country": country,
                "want_see": want_see,
                "img_link": img_link
            })
    
    
    
        # 排序
        movie_info_list.sort(key=lambda x: x['want_see'])
        # 遍历
        for movie in movie_info_list:
            print(movie)
            # 图片爬取
            with open('./douban_img/{}.jpg'.format(movie['movie_name']), 'wb') as f:
                f.write(requests.get(movie['img_link']).content)
        pd.DataFrame(movie_info_list).to_csv('{}douban_movie_info.csv'.format(city_pinyin))
    # Prompt the user for a city name on stdin.
    city = input('请输入您要查看即将上映电影信息的城市:')
    # Kick off the crawl for that city.
    spider(city)
    
    
    

    四、数据可视化

    常用的可视化工具:echarts(百度开源的)
    安装并导入matplotlib、numpy库

    # matplotlib # 用于绘图的库
    # 安装
    # pip install matplotlib numpy
    # 导入
    from matplotlib import pyplot as plt
    # 导入numpy
    import numpy as np
    

    设置支持中文字体

    # Use the SimHei font so CJK labels render, and keep the minus sign
    # displayable while a CJK font is active.
    plt.rcParams["font.sans-serif"] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    
    1. 绘制 正弦曲线(线形图)
    # Plot sine and cosine curves.
    # Sample 100 evenly spaced points (x, y) and draw the curve through them.
    # Generate 100 evenly spaced points over [0, 2*pi].
    x = np.linspace(0, 2*np.pi, num=100) # linspace generates evenly spaced values
    y = np.sin(x)
    cosy = np.cos(x)
    # Draw the line plot.
    plt.plot(x, y,
            color='r', # line colour
            linestyle = ':', # line style
             # '-' solid, '--' dashed, ':' dotted
            marker='o',# marker shape
             # 'o' filled circle, '*', '+'
            markerfacecolor='r', # marker colour
            alpha=0.8, # transparency
            label='代表正弦曲线' # legend label
    )
    plt.plot(x, cosy,
             label='cos(x)',
             linestyle='-',
             color='g',
             marker='*',
             markerfacecolor='r',
             alpha=0.6
    )
    plt.xlabel('time(s)')
    plt.ylabel('电压(v)')
    plt.title('电压随时间变化曲线')
    plt.legend() # show the legend
    plt.show()
    
    

    输出线形图:


    输出线形图
    2. 绘制条形图
    from random import randint
    x = ['口红{}'.format(i) for i in range(1, 7)]
    print(x)
    y = [randint(200, 1000) for _ in range(6)]
    print(y)
    plt.bar(x, y)
    plt.grid() # 带网格
    plt.xlabel('口红品牌')
    plt.ylabel('口红价格(元)')
    plt.show()
    

    输出条形图:


    输出条形图

    五、作业

    1. 三国人物分析top10绘制条形图

    import jieba
    from matplotlib import pyplot as plt
    plt.rcParams["font.sans-serif"] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    import numpy
    # 读取文件
    with open('novel/threekingdom.txt', 'r', encoding='UTF-8') as f:
        data = f.read()
        # 分词
        words_list = jieba.lcut(data)
        # print(words_list)
        # print(type(words_list)) # <class 'list'>
    
        # 构建一个集合,定义无关词   ,用集合因为可以去重
        excludes = {"将军", "却说", "二人", "不可", "荆州", "不能", "如此", "丞相",
                    "商议", "如何", "主公", "军士", "军马", "左右", "次日", "引兵",
                    "大喜", "天下", "东吴", "于是", "今日", "不敢", "魏兵", "陛下",
                    "都督", "人马", "不知", "孔明曰", "玄德曰", "刘备", "云长"}
                    # , ""
        #构建一个容器,存储我们要的数据
        #{"夏侯渊":34,"害怕":33...}\
        counts = {}
        print(type(counts)) # <class 'dict'>
        # 遍历wordlist 目标是筛选出人名
        for word in words_list:
            # print(word)
            if len(word) <= 1:
                # 过滤无关词语即可
                continue
            else:
                # 向字典counts里更新值
                # counts[word] = 字典中原来该词出现的次数 + 1
                # counts[word] = counts[word] + 1
                # counts["正文"] = count["正文"] + 1
                counts[word] = counts.get(word, 0) + 1
        # print(counts)
    
        # 指向同一个词的人进行合并 , 记得把合并的词变为无关词,放到excludes中
        counts['孔明'] = counts['孔明'] + counts['孔明曰']
        counts['玄德'] = counts['玄德'] + counts['玄德曰'] + counts['刘备']
        counts['关公'] = counts['关公'] + counts['云长']
    
        # 删除无关的词语
        for word in excludes:
            del counts[word]
    
        # 排序筛选
        # 把字典转化成列表[(),()]  [{}]
        items = list(counts.items())
        print(items)
        # 按照词频次数进行排序
        items.sort(key=lambda x: x[1], reverse=True)
        print(items)
        # 显示出现词语前10的词
    
        x = []
        y = []
        for i in range(10):
            # 将返回的数据拆开,拆包
            role, count = items[i]
            x.append(role)
            y.append(count)
        plt.bar(x, y)
        plt.xlabel('人物')
        plt.ylabel('频次')
        plt.title('三国人物出现次数top10')
        plt.grid()
        plt.show()
    

    输出:


    三国人物分析top10

    2. 豆瓣中最想看的即将上映电影top5条形图

    from xpinyin import Pinyin
    import requests
    from lxml import html
    from matplotlib import pyplot as plt
    import numpy
    import pandas as pd
    # pip install xpinyin
    def spider(city):
        # splitter 是分隔使用符号,默认是‘-’
        city_pinyin = Pinyin().get_pinyin(city, splitter='')
        url = 'https://movie.douban.com/cinema/later/{}/'.format(city_pinyin)
        print('您要爬取的目标站点是', url)
        print('爬虫进行中,请稍后...')
        # 请求头信息, 目的是伪装成浏览器进行爬虫
        headers = {'User-Agent': 'Mozillsa/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
        # 获取网页的源代码
        response = requests.get(url, headers=headers)
        html_data = response.text
        # 提取我们想要的内容
        selector = html.fromstring(html_data)
        div_list = selector.xpath('//div[@id="showing-soon"]/div') # xpath爬取的是一个列表
        print('您好,{}市共查询到{}部即将上映的电影'.format(city,len(div_list)))
        movie_info_list = []
        for div in div_list:
            movie_name = div.xpath('div[1]/h3/a/text()')
            # if len(movie_name)==0:
            #     movie_name = '没有查询到数据'
            # else:
            #     movie_name = movie_name[0]
    
            # 代码优化   ,可以使程序错误也继续运行
            movie_name = '没有查询到数据' if len(movie_name) == 0 else movie_name[0]
            # print(movie_name)
            # 上映日期
            date = div.xpath('div[1]/ul/li[1]/text()')
            date = '没有查询到数据' if len(date) == 0 else date[0]
            # print(date)
            # 类型
            type = div.xpath('div[1]/ul/li[2]/text()')
            type = '没有查询到数据' if len(type) == 0 else type[0]
            # print(type)
            # 国家
            country = div.xpath('div[1]/ul/li[3]/text()')
            country = '没有查询到数据' if len(country) == 0 else country[0]
            # print(country)
            # 想看人数
            want_see = div.xpath('div[1]/ul/li[4]/span/text()')
            want_see = '没有查询到数据' if len(want_see) == 0 else want_see[0]
            want_see = int(want_see.replace('人想看', ''))
            # print(want_see)
            # 图片链接
            img_link = div.xpath('a/img/@src')
            img_link = '没有查询到数据' if len(img_link) == 0 else img_link[0]
            # 将信息放入一个列表中 [{}, {}, {}]
            movie_info_list.append({
                "movie_name": movie_name,
                "date": date,
                "type": type,
                "country": country,
                "want_see": want_see,
                "img_link": img_link
            })
    
    
    
        # 排序
        movie_info_list.sort(key=lambda x: x['want_see'],reverse=True)
        print(movie_info_list)
    
        # 绘制想看人数前五的条形图
        plt.rcParams["font.sans-serif"] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
    
        # 笨方法:
        # a = []
        # b = []
        # for movie in movie_info_list:
        #     movie_name = movie['movie_name']
        #     want_see = movie['want_see']
        #     a.append(movie_name)
        #     b.append(want_see)
        # x = []
        # y = []
        # for i in range(5):
        #     x.append(a[i])
        #     y.append(b[i])
    
        # 优化方法:
        x = [movie['movie_name'] for movie in movie_info_list]
        x = [x[i] for i in range(5)]
        y = [movie['want_see'] for movie in movie_info_list]
        y = [y[i] for i in range(5)]
        plt.bar(x, y)
        plt.xlabel('电影名称')
        plt.ylabel('想看人数')
        plt.title('豆瓣即将上映电影想看人数TOP5')
        plt.grid()
        plt.show()
    
    
    # Prompt the user for a city name on stdin.
    city = input('请输入您要查看即将上映电影信息的城市:')
    # Kick off the crawl and the top-5 chart for that city.
    spider(city)
    
    

    输出:


    豆瓣中最想看的即将上映电影top5

    相关文章

      网友评论

        本文标题:python第七天

        本文链接:https://www.haomeiwen.com/subject/knggvctx.html