美文网首页
python采集某旅游网攻略并作可视化分析~

python采集某旅游网攻略并作可视化分析~

作者: 颜狗一只 | 来源:发表于2022-04-06 19:12 被阅读0次

    知识点

    requests 发送网络请求
    parsel 解析数据
    csv 保存数据

    第三方库

    requests >>> pip install requests
    parsel >>> pip install parsel

    开发环境:

    版 本: python 3.8
    编辑器:pycharm 2021.2
    [图片上传失败...(image-36f74a-1649243475687)]

    开始代码

    导入模块

    # 发送网络请求的模块
    import requests
    # 解析数据的模块
    import parsel
    import csv
    import time
    import random
    

    代码

    # Scrape the first page of Qunar's "hot" travel-note listing, visit every
    # note's detail page, and append one CSV row per note to 去哪儿.csv.

    # Listing page URL (page 1, ordered by popularity).
    url = 'https://travel.qunar.com/travelbook/list.htm?page=1&order=hot_heat'
    # A <Response [200]> means the request succeeded.
    # The timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, timeout=10)

    # Raw HTML source of the listing page (a plain string).
    html_data = response.text

    # Wrap the HTML string in a Selector object so it can be queried with
    # CSS selectors.
    selector = parsel.Selector(html_data)
    # ::attr(href) extracts each link target; getall() returns them as a list.
    url_list = selector.css('.b_strategy_list li h2 a::attr(href)').getall()

    # `with` guarantees the file is flushed and closed even if a request fails.
    with open('去哪儿.csv', mode='a', encoding='utf-8', newline='') as csv_qne:
        csv_writer = csv.writer(csv_qne)
        # The file is opened in append mode, so only emit the header when the
        # file is still empty; otherwise every run would add a duplicate header.
        if csv_qne.tell() == 0:
            csv_writer.writerow(['地点', '短评', '出发时间', '天数', '人均消费', '人物', '玩法', '浏览量'])

        # Everything below must run once per note, so it lives inside the loop.
        for detail_url in url_list:
            # The href looks like '/youji/<id>'; strip the prefix to get the id.
            detail_id = detail_url.replace('/youji/', '')
            url_1 = 'https://travel.qunar.com/travelbook/note/' + detail_id
            print(url_1)

            # Fetch the detail page,
            # e.g. https://travel.qunar.com/travelbook/note/7701502
            response_1 = requests.get(url_1, timeout=10).text

            # Parse the detail page.
            selector_1 = parsel.Selector(response_1)
            # :nth-child() is a pseudo-class selector, ::text extracts the text
            # content, and * matches any element.
            # Destination: default to '' so a missing breadcrumb does not raise
            # AttributeError on None.
            title = selector_1.css('.b_crumb_cont *:nth-child(3)::text').get(default='').replace('旅游攻略', '')
            # Short comment (headline of the note).
            comment = selector_1.css('.title.white::text').get()
            # Departure date.
            date = selector_1.css('#js_mainleft > div.b_foreword > ul > li.f_item.when > p > span.data::text').get()
            # Trip length in days.
            days = selector_1.css('#js_mainleft > div.b_foreword > ul > li.f_item.howlong > p > span.data::text').get()
            # Cost per person.
            money = selector_1.css('#js_mainleft > div.b_foreword > ul > li.f_item.howmuch > p > span.data::text').get()
            # Travel companions.
            character = selector_1.css('#js_mainleft > div.b_foreword > ul > li.f_item.who > p > span.data::text').get()
            # Activity tags: several <span> elements, joined into one string.
            play_list = selector_1.css('#js_mainleft > div.b_foreword > ul > li.f_item.how > p > span.data span::text').getall()
            play = ' '.join(play_list)
            # View count.
            count = selector_1.css('.view_count::text').get()
            print(title, comment, date, days, money, character, play, count)

            # Persist the scraped record; column order matches the header row.
            csv_writer.writerow([title, comment, date, days, money, character, play, count])
    

    [图片上传失败...(image-90b308-1649243475688)]

    数据可视化

    导入模块

    import pandas as pd
    from pyecharts.commons.utils import JsCode
    from pyecharts.charts import *
    from pyecharts import options as opts
    

    导入数据

    # Load the dataset used by the charts below into a DataFrame.
    # NOTE(review): this reads '去哪儿_数分.csv', not the '去哪儿.csv' written by
    # the scraper above -- presumably a cleaned copy; confirm where it comes from.
    data = pd.read_csv('去哪儿_数分.csv')
    # Bare expression: has no effect in a script; presumably left here so a
    # notebook cell displays the DataFrame.
    data
    

    [图片上传失败...(image-143004-1649243475688)]

    旅游胜地Top10及对应费用

    # Bar chart of the ten most frequent destinations.
    # `m2` (x-axis categories) and `n2` (bar values) are defined outside this
    # snippet -- presumably destination names and their counts; confirm.

    # Vertical red-to-teal gradient, evaluated by ECharts in the browser.
    top10_gradient = JsCode("""new echarts.graphic.LinearGradient(
                0, 0, 0, 1,[{offset: 0,color: 'rgb(255,99,71)'}, {offset: 1,color: 'rgb(32,178,170)'}])
                """
    )

    bar = Bar(init_opts=opts.InitOpts(height='500px', width='1000px', theme='dark'))
    bar.add_xaxis(m2)
    bar.add_yaxis(
        '目的地Top10',
        n2,
        label_opts=opts.LabelOpts(is_show=True, position='top'),
        itemstyle_opts=opts.ItemStyleOpts(color=top10_gradient),
    )
    bar.set_global_opts(
        title_opts=opts.TitleOpts(title='目的地Top10'),
        # Category axis with labels rotated 90 degrees.
        xaxis_opts=opts.AxisOpts(
            name='景点名称',
            type_='category',
            axislabel_opts=opts.LabelOpts(rotate=90),
        ),
        # Fixed value range 0..120 with dashed horizontal grid lines.
        yaxis_opts=opts.AxisOpts(
            name='数量',
            min_=0,
            max_=120.0,
            splitline_opts=opts.SplitLineOpts(
                is_show=True,
                linestyle_opts=opts.LineStyleOpts(type_='dash'),
            ),
        ),
        tooltip_opts=opts.TooltipOpts(trigger='axis', axis_pointer_type='cross'),
    )
    # Reference lines for the average, maximum and minimum of the series.
    bar.set_series_opts(
        markline_opts=opts.MarkLineOpts(
            data=[
                opts.MarkLineItem(type_='average', name='均值'),
                opts.MarkLineItem(type_='max', name='最大值'),
                opts.MarkLineItem(type_='min', name='最小值'),
            ]
        )
    )
    bar.render_notebook()
    

    [图片上传失败...(image-7127cf-1649243475688)]

    # Bar chart of the average per-person cost for each destination.
    # `loc` (x-axis categories) and `price_mean2` (bar values) are defined
    # outside this snippet -- presumably destination names and mean costs; confirm.
    bar = (
        Bar(init_opts=opts.InitOpts(height='500px', width='1000px', theme='dark'))
        .add_xaxis(loc)
        .add_yaxis(
            '人均费用',
            price_mean2,
            label_opts=opts.LabelOpts(is_show=True, position='top'),
            itemstyle_opts=opts.ItemStyleOpts(
                # Vertical red-to-teal gradient, evaluated by ECharts in the browser.
                color=JsCode("""new echarts.graphic.LinearGradient(
                0, 0, 0, 1,[{offset: 0,color: 'rgb(255,99,71)'}, {offset: 1,color: 'rgb(32,178,170)'}])
                """
                )
            ),
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(title='各景点人均费用'),
            # Category axis with labels rotated 90 degrees.
            xaxis_opts=opts.AxisOpts(
                name='景点名称',
                type_='category',
                axislabel_opts=opts.LabelOpts(rotate=90),
            ),
            yaxis_opts=opts.AxisOpts(
                # This axis shows cost, not a count: the original label '数量'
                # was copied over from the Top10 chart.
                name='人均费用',
                min_=0,
                max_=2000.0,
                splitline_opts=opts.SplitLineOpts(
                    is_show=True,
                    linestyle_opts=opts.LineStyleOpts(type_='dash'),
                ),
            ),
            tooltip_opts=opts.TooltipOpts(trigger='axis', axis_pointer_type='cross'),
        )
        # Reference lines for the average, maximum and minimum of the series.
        .set_series_opts(
            markline_opts=opts.MarkLineOpts(
                data=[
                    opts.MarkLineItem(type_='average', name='均值'),
                    opts.MarkLineItem(type_='max', name='最大值'),
                    opts.MarkLineItem(type_='min', name='最小值'),
                ]
            )
        )
    )
    bar.render_notebook()
    

    [图片上传失败...(image-88cacb-1649243475688)]

    出游方式分析

    # Ring (donut) chart of travel-companion types.
    # `m1` and `n1` are defined outside this snippet; they are zipped into
    # (label, value) pairs -- presumably companion type and count; confirm.
    pie = Pie(init_opts=opts.InitOpts(theme='dark', width='1000px', height='800px'))
    pie.add(
        "",
        list(zip(m1, n1)),
        radius=["40%", "65%"],
    )
    pie.set_global_opts(
        # The title is centred so it sits inside the hole of the ring.
        title_opts=opts.TitleOpts(
            title="去哪儿\n\n出游结伴方式",
            pos_left='center',
            pos_top='center',
            title_textstyle_opts=opts.TextStyleOpts(
                color='#FF6A6A',
                font_size=30,
                font_weight='bold',
            ),
        ),
        # Hidden continuous visual map: colours slices over the fixed range 38..641.
        visualmap_opts=opts.VisualMapOpts(
            is_show=False,
            min_=38,
            max_=641,
            is_piecewise=False,
            dimension=0,
            range_color=['#9400D3', '#008afb', '#ffec4a', '#FFA500', '#ce5777'],
        ),
        legend_opts=opts.LegendOpts(is_show=False, pos_top='5%'),
    )
    # Raw ECharts item-style options passed through as a plain dict.
    slice_style = {
        "normal": {
            "barBorderRadius": [30, 30, 30, 30],
            'shadowBlur': 10,
            'shadowColor': 'rgba(0,191,255,0.5)',
            'shadowOffsetY': 1,
            'opacity': 0.8,
        }
    }
    pie.set_series_opts(
        label_opts=opts.LabelOpts(formatter="{b}: {c}", font_size=12),
        tooltip_opts=opts.TooltipOpts(trigger="item", formatter="{b}: {c}"),
        itemstyle_opts=slice_style,
    )
    pie.render_notebook()
    

    [图片上传失败...(image-40392b-1649243475688)]

    出游时间分析

    # Line chart with default options.
    # `m4` and `n4` are defined outside this snippet; both are converted with
    # .tolist() -- presumably dates and trip counts; confirm.
    line = Line()
    line.add_xaxis(m4.tolist())
    line.add_yaxis('', n4.tolist())
    line.render_notebook()
    

    [图片上传失败...(image-99d3b0-1649243475688)]
    2021年的出游时间曲线在五月一日前后起伏最大,主要原因应该是五一假期经调休后延长至4天,很多人利用这个假期外出旅行,放松身心,调整生活和工作的状态。

    出游玩法分析

    # Collect the first 20 entries of `list` in reverse order.
    # NOTE(review): `list` is a variable defined outside this snippet that
    # shadows the builtin -- presumably a sequence of (label, value) pairs
    # sorted by value; confirm, and consider renaming it at its definition.
    m5 = [list[i][0] for i in reversed(range(20))]
    n5 = [list[i][1] for i in reversed(range(20))]
    # m6 / n6 are aliases of the same list objects, not copies.
    m6, n6 = m5, n5
    
    # Horizontal bar chart built from m6 (categories) and n6 (values).

    # Horizontal red-to-teal gradient, evaluated by ECharts in the browser.
    play_gradient = JsCode("""new echarts.graphic.LinearGradient(1, 0, 0, 0, 
                                                 [{
                                                     offset: 0,
                                                     color: 'rgb(255,99,71)'
                                                 }, {
                                                     offset: 1,
                                                     color: 'rgb(32,178,170)'
                                                 }])""")

    bar = Bar(init_opts=opts.InitOpts(theme='dark', width='1000px', height='500px'))
    bar.add_xaxis(m6)
    bar.add_yaxis('', n6)
    # Value labels are drawn inside the right end of each bar, in italics.
    bar.set_series_opts(
        label_opts=opts.LabelOpts(
            is_show=True,
            position='insideRight',
            font_style='italic',
        ),
        itemstyle_opts=opts.ItemStyleOpts(color=play_gradient),
    )
    bar.set_global_opts(
        title_opts=opts.TitleOpts(title="出游玩法分析"),
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=45)),
        legend_opts=opts.LegendOpts(is_show=True),
    )
    # Swap the axes so the bars run horizontally.
    bar.reversal_axis()
    bar.render_notebook()
    

    [图片上传失败...(image-f46ca6-1649243475688)]

    相关文章

      网友评论

          本文标题:python采集某旅游网攻略并作可视化分析~

          本文链接:https://www.haomeiwen.com/subject/jicdsrtx.html