美文网首页
天池短租数据简单分析

天池短租数据简单分析

作者: butters001 | 来源:发表于2020-09-09 16:57 被阅读0次

    导入下面代码所需依赖包

    import math
    import pandas as pd
    import numpy as np
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    

    加载数据

    # Load the listings CSV (path is relative to the working directory).
    source_data = pd.read_csv(filepath_or_buffer='./source_data/listings.csv')
    # Preview the first five rows.
    source_data.head(5)
    
    image.png

    每一列的数据类型

    # Show the dtype of every column in source_data.
    source_data.dtypes
    
    image.png

    修改列名 删除两列

    # Rename the columns positionally: the list must contain exactly one name
    # per existing column, in the existing column order.
    source_data.columns = [
        'id',
        'name',
        '房主id',
        '房主名',
        '行政区组',
        '行政区名',
        '纬度',
        '经度',
        '房屋类型',
        '价格',
        '最低入住天数',
        '评论数量',
        '最后评论时间',
        '每月评论数',
        'calculated_host_listings_count',
        '可提供天数365',
    ]

    # Drop two columns from the frame in place.
    for dropped_column in ('行政区组', 'calculated_host_listings_count'):
        del source_data[dropped_column]
    

    按行政区分组 查看每个区的房屋出租数量

    # Number of rows (listings) in each district, indexed by 行政区名.
    source_data.groupby('行政区名').size()
    
    image.png

    行政区名 统一命名某某区 对房屋类型进行汉化

    # Build a rename map for district names of the form "<left> / <right>":
    # keep only the stripped left part. Names that do not split into exactly
    # two parts are left untouched. NaN entries are skipped because they have
    # no .split().
    district_renames = {}
    for neighbourhood in source_data['行政区名'].dropna().unique():
        split_res = neighbourhood.split('/')
        if len(split_res) == 2:
            district_renames[neighbourhood] = split_res[0].strip()

    # Apply the renames to the district column only, then turn every '县'
    # into '区' so all names share the same suffix. The result is assigned
    # back to the column: calling an inplace method on source_data[...] is
    # chained assignment, which warns in pandas 2.x and has no effect on the
    # frame once Copy-on-Write is active.
    source_data['行政区名'] = (
        source_data['行政区名']
        .replace(district_renames)
        .replace('县', '区', regex=True)
    )

    # Translate the room-type labels. Restricted to the room-type column so
    # identical strings in other columns are not rewritten.
    source_data['房屋类型'] = source_data['房屋类型'].replace({
        'Entire home/apt': '整套房子/公寓',
        'Private room': '私人房间',
        'Shared room': '共享房间',
    })

    source_data
    
    image.png

    再次查看每个区的房屋数量

    # Listings per district as a Series indexed by 行政区名; reused for the bar chart.
    neighbourhood_group_count = source_data.groupby('行政区名').size()
    neighbourhood_group_count
    
    image.png

    每个区房源数量绘图

    # Bar chart of listing counts per district; each bar carries its count as
    # an outside text label.
    district_names = neighbourhood_group_count.index.values
    listing_counts = neighbourhood_group_count.values
    data = go.Bar(
        name='北京市房源数量分布图',
        x=district_names,
        y=listing_counts,
        text=listing_counts,
        texttemplate='%{text:.3s}',
        textposition='outside',
    )
    fig = go.Figure(data=data)

    # Order the x categories by bar total, largest first.
    fig.update_layout(xaxis=dict(categoryorder='total descending'))
    fig.show()
    
    image.png

    每个区的房源价格箱形图

    # One box trace per district, built from that district's prices.
    district_column = source_data['行政区名']
    price_column = source_data['价格']

    fig = go.Figure()
    for neighbourhood in district_column.unique():
        district_prices = price_column[district_column == neighbourhood].values
        fig.add_trace(go.Box(y=district_prices, name=neighbourhood))
    fig.show()
    
    image.png

    剔除异常值,重新绘制箱形图 异常值设置为一万以上

    # Keep only the listings priced at 10000 or less.
    price_in_10k = source_data.loc[source_data['价格'] <= 10000]

    # One box trace per district over the filtered rows.
    district_column = price_in_10k['行政区名']
    price_column = price_in_10k['价格']
    fig = go.Figure()
    for neighbourhood in district_column.unique():
        district_prices = price_column[district_column == neighbourhood].values
        fig.add_trace(go.Box(y=district_prices, name=neighbourhood))
    fig.show()
    
    image.png

    感觉异常值还是多 设置4000以下看看

    # Keep only the listings priced at 4000 or less.
    price_in_4k = source_data.loc[source_data['价格'] <= 4000]
    above_4k = source_data.loc[source_data['价格'] > 4000]

    # Report how many listings fall on each side of the 4000 threshold.
    print('4000以内价格的房屋数量: %s ' % len(price_in_4k))
    print('4000以外价格的房屋数量: %s ' % len(above_4k))

    # One box trace per district over the filtered rows.
    district_column = price_in_4k['行政区名']
    price_column = price_in_4k['价格']
    fig = go.Figure()
    for neighbourhood in district_column.unique():
        district_prices = price_column[district_column == neighbourhood].values
        fig.add_trace(go.Box(y=district_prices, name=neighbourhood))
    fig.show()
    
    image.png

    密云 怀柔 延庆价格怎么这么高。。。单拿出来观察一下

    # Print summary statistics of the price column for three districts.
    district_column = source_data['行政区名']
    price_column = source_data['价格']

    miyun_stats = price_column[district_column == '密云区'].describe()
    print('密云', miyun_stats, '\r\n')

    chaoyang_stats = price_column[district_column == '朝阳区'].describe()
    print('朝阳', chaoyang_stats, '\r\n')

    huairou_stats = price_column[district_column == '怀柔区'].describe()
    print('怀柔', huairou_stats)
    
    image.png

    查看一下房屋类型占比

    # Group by district first, then by room type, and count the ids in each
    # (district, room type) pair.
    groupby_neighbourhood_price = (
        source_data[['行政区名', '房屋类型', 'id']]
        .groupby(['行政区名', '房屋类型'])
        .count()
    )
    groupby_neighbourhood_price
    
    image.png

    重置索引

    # Move the (行政区名, 房屋类型) group index back into ordinary columns so
    # both can be filtered on below.
    groupby_neighbourhood_price.reset_index(inplace=True)

    # One pie per district, laid out on a grid that is cols_num wide.
    neighbourhoods = groupby_neighbourhood_price['行政区名'].unique()
    cols_num = 3
    rows_num = math.ceil(len(neighbourhoods) / cols_num)

    # Pie traces need 'domain'-type subplot cells; the spec grid is derived
    # from cols_num so changing the column count keeps it consistent.
    fig = make_subplots(rows_num, cols_num,
                        specs=[[{'type': 'domain'}] * cols_num for _ in range(rows_num)],
                        subplot_titles=neighbourhoods)

    labels = ['整套房子/公寓', '私人房间', '共享房间']

    for index, neighbourhood in enumerate(neighbourhoods):
        in_district = groupby_neighbourhood_price['行政区名'] == neighbourhood
        values = []
        for label in labels:
            is_label = groupby_neighbourhood_price['房屋类型'] == label
            counts = groupby_neighbourhood_price.loc[in_district & is_label, 'id'].values
            # A district may have no row for a room type, leaving an empty
            # array. Check .size explicitly: the truth value of an empty
            # NumPy array is an error on recent NumPy versions.
            values.append(counts[0] if counts.size else 0)
        data = go.Pie(labels=labels, values=values, scalegroup=index, name=neighbourhood)
        # Fill the grid left to right, top to bottom; plotly rows and columns
        # are 1-based.
        grid_row, grid_col = divmod(index, cols_num)
        fig.add_trace(data, grid_row + 1, grid_col + 1)
    fig.show()
    
    image.png

    相关文章

      网友评论

          本文标题:天池短租数据简单分析

          本文链接:https://www.haomeiwen.com/subject/rdaeektx.html