美文网首页
05爱彼迎租房数据分析

05爱彼迎租房数据分析

作者: Jachin111 | 来源:发表于2022-12-30 18:10 被阅读0次

    导入库

    import numpy as np
    import pandas as pd
    
    import matplotlib
    import matplotlib.pyplot as plt
    import seaborn as sns
    import geopandas as gpd
    plt.style.use("fivethirtyeight")
    %matplotlib inline
    
    import plotly as plotly
    import plotly.express as px
    import plotly.graph_objects as go
    from plotly.offline import init_notebook_mode,iplot,plot
    init_notebook_mode(connected=True)
    
    import folium
    import folium.plugins
    
    import wordcloud
    from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
    
    import sklearn
    from sklearn import preprocessing
    from sklearn.metrics import r2_score,mean_absolute_error
    from sklearn.preprocessing import LabelEncoder,OneHotEncoder
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression,LogisticRegression
    from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
    
    import warnings
    warnings.filterwarnings("ignore")
    

    数据基本信息

    # Load the listings table; the relative path is resolved against the
    # notebook's working directory.
    df = pd.read_csv("listings.csv")
    # Preview the first rows (head() defaults to 5).
    df.head()
    
    image.png
    # (number of rows, number of columns).
    df.shape
    
    image.png
    # Keep a handle on the column index as it is right after loading and show it.
    columns = df.columns
    columns
    
    image.png
    # Per-column dtypes as inferred by read_csv.
    df.dtypes
    
    image.png
    # Prints a per-column non-null count / dtype summary.
    df.info()
    
    image.png
    # Number of missing values in each column.
    df.isnull().sum()
    
    image.png

    缺失值处理

    # Step 1: look at how the missing values are distributed across columns.
    # Each cell of the boolean mask is coloured by whether the value is missing.
    heatmap_size = (19.7, 8.27)
    sns.set(rc={"figure.figsize": heatmap_size})

    missing_mask = df.isnull()
    sns.heatmap(missing_mask, cmap="viridis", cbar=False, yticklabels=False)

    plt.show()
    
    image.png
    # Step 2: remove the two review columns that contain the missing values
    # (seen above), then drop every row that still has a missing value.
    # The original note states that this removes the two rows lacking a name.
    review_columns = ["last_review", "reviews_per_month"]
    df.drop(columns=review_columns, inplace=True)
    df.dropna(inplace=True)

    # Shape after cleaning.
    df.shape
    
    image.png

    数据EDA

    # Price distribution.
    # sns.distplot is deprecated in seaborn and scheduled for removal; histplot
    # with a KDE overlay and density scaling is the documented replacement.
    # cut=3 keeps the KDE curve extending past the data limits as distplot did.
    sns.histplot(df["price"], kde=True, stat="density", kde_kws={"cut": 3})

    plt.show()
    
    image.png
    # Price against the minimum-nights requirement, one point per listing.
    sns.scatterplot(
        data=df,
        x="price",
        y="minimum_nights",
    )

    plt.show()
    
    image.png
    # Region: number of listings per neighbourhood_group.
    sns.countplot(
        data=df,
        x="neighbourhood_group",
    )

    plt.show()
    
    image.png
    # Keep only listings priced below 250 (presumably so that extreme prices
    # do not dominate the box plot).
    df1 = df[df["price"] < 250]

    plt.figure(figsize=(10, 6))

    # Price spread per region for the filtered listings.
    sns.boxplot(x="neighbourhood_group", y="price", data=df1)
    # The filter is on price, not on neighbourhood_group, so the title names
    # the price threshold.
    plt.title("price < 250 by neighbourhood_group")

    plt.show()
    
    image.png
    # Listing coordinates, coloured by region.
    plt.figure(figsize=(12, 8))

    sns.scatterplot(
        data=df,
        x="longitude",
        y="latitude",
        hue="neighbourhood_group",
    )

    plt.show()
    
    image.png

    房源分布热力图

    import folium
    from folium.plugins import HeatMap

    # Base map centred on a fixed (lat, lon) point.
    map_center = [1.44255, 103.79580]
    m = folium.Map(map_center, zoom_start=11)

    # Heat layer built from the listing coordinates; rows missing either
    # coordinate are dropped first.
    heat_points = df[['latitude', 'longitude']].dropna()
    heat_gradient = {0.2: 'blue', 0.4: 'purple', 0.6: 'orange', 1.0: 'red'}
    heat_layer = HeatMap(heat_points, radius=10, gradient=heat_gradient)
    heat_layer.add_to(m)

    # `display` is supplied by the notebook environment.
    display(m)
    
    image.png

    房间类型room_type

    # Share of the different room types: start with the raw counts.
    room_type_counts = df["room_type"].value_counts()
    room_type_counts
    
    image.png
    import plotly.offline as pyo
    import plotly.graph_objs as go

    # Percentage of listings per room type: group size divided by the number
    # of non-null room_type values, times 100.
    listings_per_type = df.groupby("room_type").size()
    total_listings = df["room_type"].count()
    room_df = listings_per_type / total_listings * 100

    room_df
    
    image.png
    # Donut chart (pie with a 0.5 hole) of the room-type percentages.
    labels, values = room_df.index, room_df.values

    donut = go.Pie(labels=labels, values=values, hole=0.5)
    fig = go.Figure(data=[donut])

    fig.show()
    
    image.png
    # Room types in the different regions: one bar group per room type,
    # split by neighbourhood_group.
    plt.figure(figsize=(12, 6))

    sns.countplot(
        x="room_type",
        hue="neighbourhood_group",
        data=df,
    )
    plt.title("room types occupied by the neighbourhood_group")

    plt.show()
    
    image.png
    # Listing count for every (room_type, neighbourhood_group) pair, flattened
    # into a regular table whose count column is called "number".
    pair_sizes = df.groupby(["room_type", "neighbourhood_group"]).size()
    type_group = pair_sizes.reset_index()
    type_group = type_group.rename(columns={0: "number"})

    type_group.head()
    
    image.png
    # Grouped bars: listing count per room type, one colour per region.
    px.bar(
        type_group,
        x="room_type",
        y="number",
        color="neighbourhood_group",
        barmode="group",
    )
    
    image.png
    # Relationship between room type and price.
    # catplot is a figure-level function that creates its own figure, so a
    # preceding plt.figure(figsize=...) only leaves an empty figure behind.
    # height=6 with aspect=2 gives the intended 12x6 inch plot.
    sns.catplot(data=df, x="room_type", y="price", height=6, aspect=2)

    plt.show()
    
    image.png
    # Interactive version: price per room type, coloured by room type.
    px.scatter(
        df,
        x="room_type",
        y="price",
        color="room_type",
    )
    
    image.png

    房间名称

    # Word cloud over all listing names.
    from wordcloud import WordCloud,ImageColorGenerator

    # Concatenate every listing name (coerced to str) into one text blob.
    text = " ".join(map(str, df["name"]))
    # NOTE(review): this rebinds the name `wordcloud`, which the import cell
    # bound to the module; kept as-is so later references still resolve.
    wordcloud = WordCloud(max_words=200,background_color="white").generate(text)

    # A single figure: the original opened a (10,6) figure and immediately a
    # second (15,10) one, leaving the first empty.
    plt.figure(figsize=(15,10))
    plt.imshow(wordcloud,interpolation="bilinear")
    plt.axis("off")

    plt.show()
    
    image.png
    # Keywords in the listing names: collect every name into a plain list.
    names = list(df["name"])
        
    def split_name(name):
        """Return the whitespace-separated tokens of ``name``.

        ``name`` is converted with ``str`` first, so non-string values are
        tokenised by their string form.
        """
        return str(name).split()
    
    from collections import Counter

    # Every word of every listing name, lower-cased so that counting is
    # case-insensitive.
    names_count = [
        token.lower()
        for listing_name in names
        for token in split_name(listing_name)
    ]

    # (word, count) pairs ordered from most to least frequent.
    word_counter = Counter(names_count)
    result = word_counter.most_common()
    result[:5]
    
    image.png
    # The 20 most frequent words, as a two-column table.
    top_20 = result[:20]

    top_20_words = pd.DataFrame(data=top_20, columns=["words", "count"])
    top_20_words
    
    image.png
    plt.figure(figsize=(10, 6))

    # Note: despite its name, `fig` holds the object returned by sns.barplot,
    # on which the title, axis labels and tick labels are set below.
    fig = sns.barplot(x="words", y="count", data=top_20_words)
    fig.set_xlabel("Words")
    fig.set_ylabel("Count of words")
    fig.set_title("Counts of the top 20 used words for listing names")
    # Rotate the word labels so they do not overlap.
    tick_labels = fig.get_xticklabels()
    fig.set_xticklabels(tick_labels, rotation=80)

    plt.show()
    
    image.png

    评论量统计

    # The 1000 listings with the highest number_of_reviews, most-reviewed first.
    by_review_count = df.sort_values(by="number_of_reviews", ascending=False)
    df1 = by_review_count.head(1000)

    df1.head()
    
    image.png
    import folium
    from folium.plugins import MarkerCluster
    from folium import plugins

    print("Rooms with the most number of reviews")

    # Fixed map centre.
    Long = 103.91492
    Lat = 1.32122

    mapdf1 = folium.Map([Lat, Long], zoom_start=10)

    # Cluster layer that groups the individual listing markers.
    mapdf1_rooms_map = plugins.MarkerCluster().add_to(mapdf1)

    # One "home" marker per listing in df1, with the listing name as popup.
    listing_points = zip(df1["latitude"], df1["longitude"], df1["name"])
    for lat, lon, label in listing_points:
        marker = folium.Marker(
            location=[lat, lon],
            popup=label,
            icon=folium.Icon(icon="home"),
        )
        marker.add_to(mapdf1_rooms_map)

    # The cluster was already attached by add_to above; this call is kept as
    # the cell's final expression so the notebook shows its result.
    mapdf1.add_child(mapdf1_rooms_map)
    
    image.png

    可租天数

    # Listing coordinates coloured by availability_365, with a colour bar.
    plt.figure(figsize=(10, 6))

    plt.scatter(
        df["longitude"],
        df["latitude"],
        c=df["availability_365"],
        cmap="spring",
        edgecolors="black",
        linewidths=1,
        alpha=1,
    )
    cbar = plt.colorbar()
    cbar.set_label("availability_365")
    
    image.png
    # Interactive version of the availability map.
    px.scatter(
        df,
        x="longitude",
        y="latitude",
        color="availability_365",
    )
    
    image.png
    # Listings priced below 500.
    low_500 = df[df["price"] < 500]

    # DataFrame.plot opens its own figure unless an `ax` is supplied, so the
    # size is passed here; a preceding plt.figure(...) would only leave an
    # empty figure behind.
    # The points are coloured by price, so the legend entry describes the
    # price filter rather than availability_365.
    viz1 = low_500.plot(
        kind="scatter",
        x="longitude",
        y="latitude",
        label="price < 500",
        c="price",
        cmap=plt.get_cmap("jet"),
        colorbar=True,
        alpha=0.4,
        figsize=(10, 6),
    )
    viz1.legend()

    plt.show()
    
    image.png
    # Interactive version: listings under 500, coloured by price.
    px.scatter(
        low_500,
        x="longitude",
        y="latitude",
        color="price",
    )
    
    image.png

    线性回归建模

    # Preprocessing: drop the name, id and host_name columns in place.
    df.drop(columns=["name", "id", "host_name"], inplace=True)

    # Convert the categorical columns to integer codes, one freshly fitted
    # LabelEncoder per column.
    cols = ["neighbourhood_group", "neighbourhood", "room_type"]

    for col in cols:
        le = preprocessing.LabelEncoder()
        df[col] = le.fit(df[col]).transform(df[col])

    df.head()
    
    image.png
    # Modelling: price is the target, every remaining column is a feature.
    y = df["price"]
    X = df.drop(columns="price")

    # 80/20 train/test split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=101,
    )

    lm = LinearRegression()
    lm.fit(X_train, y_train)
    
    image.png
    # Predict prices for the held-out test features and show the result.
    predicts = lm.predict(X_test)
    predicts
    
    image.png
    # Actual and predicted prices side by side, on a fresh 0..n-1 index.
    actual_prices = y_test.tolist()
    predicted_prices = predicts.tolist()
    error_airbnb = pd.DataFrame(
        {
            "Actual": actual_prices,
            "Predict": predicted_prices,
        }
    )
    error_airbnb.head()
    
    image.png
    title=['Pred vs Actual']

    # Grouped bars: predicted and actual price for every test row.
    fig = go.Figure(data=[
        go.Bar(name='Predicted',x=error_airbnb.index,y=error_airbnb['Predict']),
        go.Bar(name='Actual',x=error_airbnb.index,y=error_airbnb['Actual'])
    ])
    # `title` was defined but never used; apply it to the figure here.
    fig.update_layout(barmode='group',title_text=title[0])

    fig.show()
    
    image.png
    # Signed prediction error (predicted minus actual), shown as a box plot.
    prediction_error = error_airbnb["Predict"] - error_airbnb["Actual"]
    error_airbnb["diff"] = prediction_error
    px.box(
        error_airbnb,
        y="diff",
    )
    
    image.png
    # Summary statistics of the numeric columns of the comparison table.
    error_airbnb.describe()
    
    image.png

    相关文章

      网友评论

          本文标题:05爱彼迎租房数据分析

          本文链接:https://www.haomeiwen.com/subject/lnaxcdtx.html