导入下面代码所需依赖包
import math
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
加载数据
# Read the listings CSV into a DataFrame; every later step works on it.
source_data = pd.read_csv(filepath_or_buffer='./source_data/listings.csv')
# Preview the first rows (rendered as the cell output in a notebook).
source_data.head()
image.png
每一列的数据类型
# Show the dtype pandas inferred for each column (cell output in a notebook).
source_data.dtypes
image.png
修改列名 删除两列
# Rename the columns positionally: the list must contain exactly one name per
# existing column, in the same order as the columns of the loaded frame.
source_data.columns = [
    'id',
    'name',
    '房主id',
    '房主名',
    '行政区组',
    '行政区名',
    '纬度',
    '经度',
    '房屋类型',
    '价格',
    '最低入住天数',
    '评论数量',
    '最后评论时间',
    '每月评论数',
    'calculated_host_listings_count',
    '可提供天数365',
]
# Remove the two columns that the rest of the analysis does not use.
source_data.drop(
    columns=['行政区组', 'calculated_host_listings_count'],
    inplace=True,
)
按行政区分组 查看每个区的房屋出租数量
# Number of listings per district, before the district names are cleaned up.
source_data.groupby('行政区名').size()
image.png
行政区名 统一命名某某区 对房屋类型进行汉化
# Normalise district names and translate room types.
#
# Every replacement is assigned back to its own column rather than applied
# with a chained ``inplace=True`` call or a frame-wide ``replace``:
#   * ``source_data[col].replace(..., inplace=True)`` acts on a temporary
#     Series and does not update the frame under pandas Copy-on-Write;
#   * a frame-wide ``replace`` would also rewrite cells in unrelated columns
#     (for example a listing whose name is exactly 'Private room').

# District values of the form "A / B" keep only the part before the slash;
# values that do not split into exactly two parts are left untouched.
district_names = {}
for neighbourhood in source_data['行政区名'].unique():
    split_res = neighbourhood.split('/')
    if len(split_res) == 2:
        district_names[neighbourhood] = split_res[0].strip()

# Apply the mapping, then rename every "县" to "区" so that all districts
# share the same suffix.
source_data['行政区名'] = (
    source_data['行政区名']
    .replace(district_names)
    .str.replace('县', '区', regex=False)
)

# Translate the room-type labels into Chinese.
source_data['房屋类型'] = source_data['房屋类型'].replace({
    'Entire home/apt': '整套房子/公寓',
    'Private room': '私人房间',
    'Shared room': '共享房间',
})

# Display the cleaned frame (cell output in a notebook).
source_data
image.png
再次查看每个区的房屋数量
# Listings per district after name normalisation; reused by the bar chart.
neighbourhood_group_count = source_data.groupby('行政区名').size()
neighbourhood_group_count
image.png
每个区房源数量绘图
# Bar chart of the number of listings per district.
# The counts are used both as bar heights and as the labels drawn above
# the bars (formatted with the d3 '.3s' specifier).
data = go.Bar(
    name='北京市房源数量分布图',
    x=neighbourhood_group_count.index.values,
    y=neighbourhood_group_count.values,
    text=neighbourhood_group_count.values,
    texttemplate='%{text:.3s}',
    textposition='outside',
)
fig = go.Figure()
fig.add_trace(data)
# Order the districts from the largest total to the smallest.
fig.update_layout(xaxis=dict(categoryorder='total descending'))
fig.show()
image.png
每个区的房源价格箱型图
# One box trace per district showing the price distribution of all listings.
fig = go.Figure()
for neighbourhood in source_data['行政区名'].unique():
    in_district = source_data['行政区名'] == neighbourhood
    fig.add_trace(
        go.Box(
            y=source_data.loc[in_district, '价格'].values,
            name=neighbourhood,
        )
    )
fig.show()
image.png
剔除异常值,重新绘制箱形图 异常值设置为一万以上
# Keep only the listings priced at 10000 or less, then redraw the
# per-district box plot on that subset.
price_in_10k = source_data.loc[source_data['价格'] <= 10000]
fig = go.Figure()
for neighbourhood in price_in_10k['行政区名'].unique():
    in_district = price_in_10k['行政区名'] == neighbourhood
    fig.add_trace(
        go.Box(
            y=price_in_10k.loc[in_district, '价格'].values,
            name=neighbourhood,
        )
    )
fig.show()
image.png
感觉异常值还是多 设置4000以下看看
# Split the listings at a price of 4000 and report the size of each side.
price_in_4k = source_data.loc[source_data['价格'] <= 4000]
price_over_4k = source_data.loc[source_data['价格'] > 4000]
print(f'4000以内价格的房屋数量: {price_in_4k.shape[0]} ')
print(f'4000以外价格的房屋数量: {price_over_4k.shape[0]} ')

# Per-district box plot restricted to the listings priced at 4000 or less.
fig = go.Figure()
for neighbourhood in price_in_4k['行政区名'].unique():
    in_district = price_in_4k['行政区名'] == neighbourhood
    fig.add_trace(
        go.Box(
            y=price_in_4k.loc[in_district, '价格'].values,
            name=neighbourhood,
        )
    )
fig.show()
image.png
密云 怀柔 延庆价格怎么这么高。。。单拿出来观察一下
# Print summary statistics of the price column for three districts.
# Each entry is (printed label, district name, extra trailing print arguments);
# the first two reports are followed by a '\r\n' separator, the last is not.
district_reports = [
    ('密云', '密云区', ('\r\n',)),
    ('朝阳', '朝阳区', ('\r\n',)),
    ('怀柔', '怀柔区', ()),
]
for report_label, district, trailer in district_reports:
    price_summary = source_data.loc[
        source_data['行政区名'] == district, '价格'
    ].describe()
    print(report_label, price_summary, *trailer)
image.png
查看一下房屋类型占比
# Group by district first and by room type second, counting the listing ids
# in each (district, room type) combination.
groupby_neighbourhood_price = (
    source_data
    .groupby(['行政区名', '房屋类型'])[['id']]
    .count()
)
groupby_neighbourhood_price
image.png
重置索引
# Draw one pie chart per district showing the share of each room type,
# laid out on a grid of `cols_num` columns.

# Turn the (district, room type) group keys back into ordinary columns.
groupby_neighbourhood_price.reset_index(inplace=True)
neighbourhoods = groupby_neighbourhood_price['行政区名'].unique()

# The grid width is defined once and reused for the row count, the subplot
# specs and the cell position of every pie.
cols_num = 3
rows_num = math.ceil(len(neighbourhoods) / cols_num)
fig = make_subplots(
    rows_num,
    cols_num,
    # Pie traces need 'domain' cells; build an independent dict per cell.
    specs=[[{'type': 'domain'} for _ in range(cols_num)]
           for _ in range(rows_num)],
    # Titles are passed as a plain list of strings.
    subplot_titles=list(neighbourhoods),
)

labels = ['整套房子/公寓', '私人房间', '共享房间']
for index, neighbourhood in enumerate(neighbourhoods):
    district_rows = groupby_neighbourhood_price[
        groupby_neighbourhood_price['行政区名'] == neighbourhood
    ]
    values = []
    for label in labels:
        matched = district_rows.loc[
            district_rows['房屋类型'] == label, 'id'
        ].values
        # A district with no listing of this room type has no row at all.
        # Test the array size explicitly: the truth value of an empty NumPy
        # array is an error on recent NumPy releases.
        values.append(matched[0] if matched.size else 0)
    data = go.Pie(labels=labels, values=values, scalegroup=index,
                  name=neighbourhood)
    # Fill the grid row by row; plotly grid positions are 1-based.
    row = index // cols_num + 1
    col = index % cols_num + 1
    fig.add_trace(data, row, col)
fig.show()
image.png
网友评论