导入库
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
plt.style.use("fivethirtyeight")
%matplotlib inline
import plotly as plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode,iplot,plot
init_notebook_mode(connected=True)
import folium
import folium.plugins
import wordcloud
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
import sklearn
from sklearn import preprocessing
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
import warnings
warnings.filterwarnings("ignore")
数据基本信息
# Load the listings table from the CSV file in the working directory.
df = pd.read_csv("listings.csv")
# Preview the first rows.
df.head()
image.png
# (number of rows, number of columns) of the raw table.
df.shape
image.png
# Keep the column index in a variable and display it.
columns = df.columns
columns
image.png
# Data type of every column.
df.dtypes
image.png
# Concise summary of the frame (columns, non-null counts, dtypes).
df.info()
image.png
# Number of missing values per column.
df.isnull().sum()
image.png
缺失值处理
# 1. Look at how the missing values are distributed across the columns:
#    every cell of the heatmap is one value, highlighted when it is null.
sns.set(rc={"figure.figsize": (19.7, 8.27)})
missing_mask = df.isnull()
sns.heatmap(missing_mask, cmap="viridis", cbar=False, yticklabels=False)
plt.show()
image.png
# 2. Drop the two columns with missing values seen above, then drop every
#    remaining row that still contains a null (per the original note, the
#    two records without a name).
df.drop(columns=["last_review", "reviews_per_month"], inplace=True)
df.dropna(axis=0, how="any", inplace=True)
df.shape
image.png
数据EDA
# Price distribution.
# `sns.distplot` is deprecated and scheduled for removal; a histogram on a
# density scale with a KDE overlay is the documented replacement.
sns.histplot(df["price"], kde=True, stat="density")
plt.show()
image.png
# Price against the minimum number of nights of each listing
sns.scatterplot(data=df, y="minimum_nights", x="price")
plt.show()
image.png
# Area: number of listings in each neighbourhood group
sns.countplot(data=df, x="neighbourhood_group")
plt.show()
image.png
# Price spread per neighbourhood group, restricted to listings priced below 250
df1 = df.loc[df["price"] < 250]
plt.figure(figsize=(10, 6))
box_ax = sns.boxplot(data=df1, x="neighbourhood_group", y="price")
box_ax.set_title("neighbourhood_group < 250")
plt.show()
image.png
# Listing coordinates, coloured by neighbourhood group
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x="longitude",
    y="latitude",
    hue="neighbourhood_group",
)
plt.show()
image.png
房源分布热力图
import folium
from folium.plugins import HeatMap

# Base map centred on a fixed coordinate pair
m = folium.Map([1.44255, 103.79580], zoom_start=11)

# One heat point per listing that has both coordinates
heat_points = df[['latitude', 'longitude']].dropna()
# Colour ramp: threshold -> colour
heat_gradient = {0.2: 'blue', 0.4: 'purple', 0.6: 'orange', 1.0: 'red'}
heat_layer = HeatMap(heat_points, radius=10, gradient=heat_gradient)
heat_layer.add_to(m)
display(m)
image.png
房间类型room_type
# Breakdown of listings by room type (absolute counts per type)
df["room_type"].value_counts()
image.png
import plotly.offline as pyo
import plotly.graph_objs as go

# Percentage of listings that belong to each room type
room_counts = df.groupby("room_type").size()
room_total = df["room_type"].count()
room_df = room_counts / room_total * 100
room_df
image.png
# Donut chart of the room-type percentages
labels, values = room_df.index, room_df.values
donut = go.Pie(labels=labels, values=values, hole=0.5)
fig = go.Figure(data=[donut])
fig.show()
image.png
# Room types split by neighbourhood group
plt.figure(figsize=(12, 6))
count_ax = sns.countplot(x="room_type", hue="neighbourhood_group", data=df)
count_ax.set_title("room types occupied by the neighbourhood_group")
plt.show()
image.png
# Listing count for every (room type, neighbourhood group) pair, as a flat
# table whose count column is called "number"
type_group = (
    df.groupby(["room_type", "neighbourhood_group"])
    .size()
    .reset_index(name="number")
)
type_group.head()
image.png
# Interactive grouped bar chart of the same counts
px.bar(
    type_group,
    x="room_type",
    y="number",
    color="neighbourhood_group",
    barmode="group",
)
image.png
# Relationship between room type and price.
# `catplot` is a figure-level function that creates its own figure, so a
# preceding `plt.figure(figsize=(12, 6))` only leaves an empty figure behind.
# The same 12 x 6 inch size is requested through height/aspect instead.
sns.catplot(data=df, x="room_type", y="price", height=6, aspect=2)
plt.show()
image.png
# Interactive version: price of every listing, grouped and coloured by room type
px.scatter(
    df,
    x="room_type",
    y="price",
    color="room_type",
)
image.png
房间名称
# Word cloud built from all listing names
from wordcloud import WordCloud,ImageColorGenerator

# Join every name (converted to str) into one text blob for the generator.
text = " ".join(map(str, df.name))
# NOTE(review): this rebinds `wordcloud`, hiding the module of the same name
# imported at the top of the file; the variable name is kept for compatibility.
wordcloud = WordCloud(max_words=200, background_color="white").generate(text)
# Open a single figure: the earlier extra `plt.figure(figsize=(10,6))` call
# was never drawn on and only produced an empty figure.
plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
image.png
# Keywords in the listing names: start from a plain list of all names
names = list(df.name)
def split_name(name):
    """Return the whitespace-separated tokens of ``name``.

    ``name`` is passed through ``str()`` first, so a non-string value is
    tokenised by its string form instead of raising.
    """
    return str(name).split()
from collections import Counter

# Collect every token of every name, lower-cased so that counting is
# case-insensitive.
names_count = []
for each in names:
    names_count.extend(word.lower() for word in split_name(each))

# (word, count) pairs ordered from most to least frequent
result = Counter(names_count).most_common()
result[:5]
image.png
# Tabulate the 20 most frequent words
top_20 = result[:20]
top_20_words = pd.DataFrame(data=top_20, columns=["words", "count"])
top_20_words
image.png
# Bar chart of the 20 most frequent words
plt.figure(figsize=(10, 6))
# `fig` holds the object returned by barplot; labels are set on it directly
fig = sns.barplot(x="words", y="count", data=top_20_words)
fig.set(
    title="Counts of the top 20 used words for listing names",
    ylabel="Count of words",
    xlabel="Words",
)
# Tilt the word labels so they do not overlap
fig.set_xticklabels(fig.get_xticklabels(), rotation=80)
plt.show()
image.png
回访量统计
# The 1000 listings with the highest number of reviews
by_reviews = df.sort_values("number_of_reviews", ascending=False)
df1 = by_reviews.head(1000)
df1.head()
image.png
import folium
from folium.plugins import MarkerCluster
from folium import plugins

print("Rooms with the most number of reviews")
# Fixed map centre (longitude / latitude)
Long = 103.91492
Lat = 1.32122
mapdf1 = folium.Map([Lat, Long], zoom_start=10)
# Cluster layer that groups the markers; add_to() attaches it to the map
mapdf1_rooms_map = MarkerCluster().add_to(mapdf1)
# One marker per listing in df1, with the listing name as popup text
for lat, lon, label in zip(df1.latitude, df1.longitude, df1.name):
    folium.Marker(
        location=[lat, lon],
        icon=folium.Icon(icon="home"),
        popup=label,
    ).add_to(mapdf1_rooms_map)
# Kept as the final expression of the cell so its result is what the
# notebook displays.
mapdf1.add_child(mapdf1_rooms_map)
image.png
可租天数
# Listing coordinates coloured by availability_365
plt.figure(figsize=(10, 6))
plt.scatter(
    x=df.longitude,
    y=df.latitude,
    c=df.availability_365,
    cmap="spring",
    edgecolors="black",
    linewidths=1,
    alpha=1,
)
# Colour bar for the scatter drawn above
cbar = plt.colorbar()
cbar.set_label("availability_365")
image.png
# Interactive version of the availability map
px.scatter(
    df,
    x="longitude",
    y="latitude",
    color="availability_365",
)
image.png
# Listings priced below 500, placed by coordinates and coloured by price.
low_500 = df[df.price < 500]
# DataFrame.plot opens its own figure when no `ax` is given, so a separate
# `plt.figure(figsize=(10,6))` call is ignored and leaves an empty figure;
# the size is passed to `plot` directly instead.
# NOTE(review): the legend label reads "availability_365" although the colour
# encodes price — confirm the intended label.
viz1 = low_500.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    label="availability_365",
    c="price",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    alpha=0.4,
    figsize=(10, 6),
)
viz1.legend()
plt.show()
image.png
# Interactive version of the price map for listings below 500
px.scatter(
    low_500,
    x="longitude",
    y="latitude",
    color="price",
)
image.png
线性回归建模
# Pre-processing: remove the name / id / host_name columns before modelling
df.drop(["name", "id", "host_name"], inplace=True, axis=1)

# Convert the categorical text columns to integer codes.
# NOTE(review): integer codes give nominal categories an arbitrary numeric
# order when used in a linear model — confirm this is intended.
cols = ["neighbourhood_group", "neighbourhood", "room_type"]
for col in cols:
    le = preprocessing.LabelEncoder()
    # fit_transform learns the classes and encodes the column in one step
    df[col] = le.fit_transform(df[col])
df.head()
image.png
# Modelling: linear regression of price on all remaining columns,
# using an 80/20 train/test split with a fixed random seed
y = df["price"]
X = df.drop(columns="price")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=101
)
lm = LinearRegression()
lm.fit(X_train, y_train)
image.png
# Predict prices for the held-out test rows and display them
predicts = lm.predict(X_test)
predicts
image.png
# Side-by-side table of actual and predicted prices for the test rows
actual_values = np.array(y_test).tolist()
predicted_values = predicts.tolist()
error_airbnb = pd.DataFrame({"Actual": actual_values, "Predict": predicted_values})
error_airbnb.head()
image.png
# Grouped bars comparing predicted and actual price for every test row
title = ['Pred vs Actual']
fig = go.Figure(data=[
    go.Bar(name='Predicted', x=error_airbnb.index, y=error_airbnb['Predict']),
    go.Bar(name='Actual', x=error_airbnb.index, y=error_airbnb['Actual']),
])
# `title` used to be defined but never applied; show it as the chart title
fig.update_layout(barmode='group', title_text=title[0])
fig.show()
image.png
# Signed prediction error (predicted minus actual), shown as a box plot
error_airbnb["diff"] = error_airbnb["Predict"].sub(error_airbnb["Actual"])
px.box(error_airbnb, y="diff")
image.png
# Summary statistics of the numeric columns of the comparison table
error_airbnb.describe()
image.png
网友评论