美文网首页
02房价统计

02房价统计

作者: Jachin111 | 来源:发表于2022-12-13 19:36 被阅读0次

    数据EDA

    # 导入库
    import pandas as pd
    import numpy as np
    
    import plotly.express as px
    import matplotlib.pyplot as plt
    import seaborn as sns
    plt.style.use("fivethirtyeight")
    
    from scipy.stats import norm
    from scipy import stats
    from sklearn.preprocessing import StandardScaler
    
    import warnings
    warnings.filterwarnings('ignore')
    %matplotlib inline
    
    # Load the training and test sets from CSV files in the working directory
    train, test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

    # Preview the first rows of the training set
    train.head()
    
    image.png
    # Dataset dimensions of the training set as a (rows, columns) tuple
    train.shape
    
    image.png
    # Number of missing values per column, sorted in ascending order
    train.isnull().sum().sort_values()
    
    image.png
    # Descriptive statistics of the training set
    train.describe()
    
    image.png

    销售价格SalePrice分析

    # Descriptive statistics of the SalePrice column
    train["SalePrice"].describe()
    
    image.png
    # Distribution of SalePrice as a density histogram with a KDE curve.
    # sns.distplot is deprecated (and removed in recent seaborn releases);
    # histplot with kde=True and stat="density" draws the equivalent figure.
    sns.histplot(train['SalePrice'], kde=True, stat="density")
    plt.show()
    
    image.png
    # Skewness and kurtosis of SalePrice
    sale_price = train['SalePrice']
    print("Skewness(偏度):%f" % sale_price.skew())
    print("Kurtosis(峰度):%f" % sale_price.kurt())
    
    image.png
    # Relationship between SalePrice and numeric fields: start with GrLivArea
    data = train.loc[:, ["SalePrice", "GrLivArea"]]
    data.head()
    
    image.png
    # Scatter plot of SalePrice against GrLivArea
    plt.figure(num=1, figsize=(12, 6))
    sns.scatterplot(data=data, x="GrLivArea", y="SalePrice")
    plt.show()
    
    image.png
    # Scatter plot of SalePrice against TotalBsmtSF
    data = train.loc[:, ["SalePrice", "TotalBsmtSF"]]

    plt.figure(num=1, figsize=(12, 6))
    sns.scatterplot(data=data, x="TotalBsmtSF", y="SalePrice")
    plt.show()
    
    image.png
    # Relationship between price and categorical fields:
    # first, the number of rows for each OverallQual value
    train["OverallQual"].value_counts()
    
    image.png
    # Box plot of SalePrice for each OverallQual value, y axis fixed to 0-800000
    data = train.loc[:, ["SalePrice", "OverallQual"]]

    f, ax = plt.subplots(1, figsize=(12, 6))
    fig = sns.boxplot(data=data, x="OverallQual", y="SalePrice", ax=ax)

    fig.set_ylim(0, 800000)
    plt.show()
    
    image.png
    # Box plot of SalePrice for each YearBuilt value, y axis fixed to 0-800000
    data = train.loc[:, ["SalePrice", "YearBuilt"]]

    f, ax = plt.subplots(1, figsize=(16, 8))
    fig = sns.boxplot(data=data, x="YearBuilt", y="SalePrice", ax=ax)

    fig.set_ylim(0, 800000)
    plt.show()
    
    image.png

    相关性分析

    # Overall correlation matrix.
    # DataFrame.corr() no longer silently skips non-numeric columns in
    # pandas >= 2.0 (it raises on string columns), so restrict the frame to
    # numeric columns explicitly; this also works on older pandas versions.
    corrmat = train.select_dtypes(include=np.number).corr()
    corrmat.head()
    
    image.png
    # Heatmap of the full correlation matrix; the colour scale is capped at 0.8
    f, ax = plt.subplots(figsize=(12, 6))
    sns.heatmap(data=corrmat, vmax=0.8, square=True, ax=ax)
    plt.show()
    
    image.png
    # Zoom in on the correlation matrix: labels of the k rows with the
    # largest values in the SalePrice column
    k = 10
    corrmat.nlargest(k,"SalePrice")["SalePrice"].index
    
    image.png
    # The k rows of the correlation matrix with the largest SalePrice values
    k = 10
    corrmat.nlargest(k,"SalePrice")
    
    image.png
    # Keep the labels of those k rows for the zoomed-in heatmap
    cols = corrmat.nlargest(k,"SalePrice").index
    cols
    
    image.png
    # Correlation coefficients between the selected columns (columns are the variables)
    cm = np.corrcoef(train[cols].to_numpy(), rowvar=False)
    cm[:3]
    
    image.png
    # Annotated heatmap of the correlation coefficients of the selected columns
    sns.set(font_scale=1.25)
    tick_labels = cols.values
    hm = sns.heatmap(
        cm,
        cbar=True, annot=True, square=True,
        fmt='.2f', annot_kws={'size': 10},
        xticklabels=tick_labels, yticklabels=tick_labels,
    )

    plt.show()
    
    image.png
    # Pairwise scatter plots of SalePrice and selected variables.
    # seaborn renamed pairplot's `size` argument to `height`; passing `size`
    # raises a TypeError on current seaborn releases.
    cols = ['SalePrice','OverallQual','GrLivArea','GarageCars','TotalBsmtSF','FullBath','YearBuilt']
    sns.pairplot(train[cols], height=2.5)
    plt.show()
    
    image.png

    缺失值处理

    # Missing-value count per column, largest first
    total = train.isnull().sum().sort_values(ascending=False)
    total.head()
    
    image.png
    # Fraction of missing values per column (0-1), largest first
    null_mask = train.isnull()
    percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
    percent.head()
    
    image.png
    # Combine counts and fractions into one table describing missing values overall
    missing_data = pd.concat({"Total": total, "Percent": percent}, axis=1)
    missing_data.head()
    
    image.png
    # Handling missing values: columns with more than one missing value
    missing_data[missing_data["Total"]>1].index
    
    image.png
    # Drop every column with more than one missing value, then the rows
    # where Electrical is missing.
    # DataFrame.drop no longer accepts `axis` positionally (removed in
    # pandas 2.0), so the columns are named with the `columns=` keyword.
    train = train.drop(columns=missing_data[missing_data["Total"]>1].index)
    train = train.drop(train.loc[train["Electrical"].isnull()].index)
    # Largest remaining per-column missing count
    train.isnull().sum().max()
    
    image.png

    离群点(outliers)

    # Look for outliers: standardise SalePrice.
    # Indexing a Series with [:, np.newaxis] is no longer supported by pandas,
    # so convert to a NumPy array first and add the column axis there;
    # StandardScaler expects a 2-D (n_samples, 1) input.
    saleprice_scaled = StandardScaler().fit_transform(train["SalePrice"].to_numpy()[:, np.newaxis])
    saleprice_scaled[:5]
    
    image.png
    # The ten smallest and ten largest standardised SalePrice values
    ascending_order = saleprice_scaled[:, 0].argsort()
    low_range = saleprice_scaled[ascending_order][:10]
    high_range = saleprice_scaled[ascending_order][-10:]

    for block in (low_range, '---------------', high_range):
        print(block)
    
    image.png
    # Univariate analysis 1: SalePrice against GrLivArea
    data = train.loc[:, ["SalePrice", "GrLivArea"]]
    data.plot(kind="scatter", x="GrLivArea", y="SalePrice", ylim=(0, 800000))
    plt.show()
    
    image.png
    # Removing outliers: show the two rows with the largest GrLivArea
    train.sort_values(by='GrLivArea', ascending=False).head(2)
    
    image.png
    # Remove the rows with Id 1299 and Id 524
    outlier_rows = train[train['Id'].isin([1299, 524])]
    train = train.drop(outlier_rows.index)
    train.head()
    
    image.png
    # Univariate analysis 2: SalePrice against TotalBsmtSF
    data = train.loc[:, ["SalePrice", "TotalBsmtSF"]]
    data.plot(kind="scatter", x="TotalBsmtSF", y="SalePrice", ylim=(0, 800000))
    plt.show()
    
    image.png

    深入理解SalePrice

    # Normality check (SalePrice): density histogram with a fitted normal
    # curve, followed by a normal probability plot on a new figure.
    # sns.distplot(..., fit=norm) is deprecated/removed in seaborn, so the
    # histogram uses histplot and the normal fit is drawn explicitly.
    values = train["SalePrice"]
    sns.histplot(values, kde=True, stat="density")
    mu, sigma = norm.fit(values)
    grid = np.linspace(values.min(), values.max(), 200)
    plt.plot(grid, norm.pdf(grid, mu, sigma), color="black")
    fig = plt.figure()
    res = stats.probplot(values, plot=plt)
    plt.show()
    
    image.png
    # Apply a natural-log transform to SalePrice, then redraw the normality plots.
    train["SalePrice"] = np.log(train["SalePrice"])

    # sns.distplot(..., fit=norm) is deprecated/removed in seaborn, so the
    # histogram uses histplot and the normal fit is drawn explicitly.
    values = train["SalePrice"]
    sns.histplot(values, kde=True, stat="density")
    mu, sigma = norm.fit(values)
    grid = np.linspace(values.min(), values.max(), 200)
    plt.plot(grid, norm.pdf(grid, mu, sigma), color="black")
    fig = plt.figure()
    res = stats.probplot(values, plot=plt)
    plt.show()
    
    image.png
    # Normality check (GrLivArea): density histogram with a fitted normal
    # curve, followed by a normal probability plot on a new figure.
    # sns.distplot(..., fit=norm) is deprecated/removed in seaborn, so the
    # histogram uses histplot and the normal fit is drawn explicitly.
    values = train["GrLivArea"]
    sns.histplot(values, kde=True, stat="density")
    mu, sigma = norm.fit(values)
    grid = np.linspace(values.min(), values.max(), 200)
    plt.plot(grid, norm.pdf(grid, mu, sigma), color="black")
    fig = plt.figure()
    res = stats.probplot(values, plot=plt)
    plt.show()
    
    image.png
    # Apply a natural-log transform to GrLivArea, then redraw the normality plots.
    train["GrLivArea"] = np.log(train["GrLivArea"])

    # sns.distplot(..., fit=norm) is deprecated/removed in seaborn, so the
    # histogram uses histplot and the normal fit is drawn explicitly.
    values = train["GrLivArea"]
    sns.histplot(values, kde=True, stat="density")
    mu, sigma = norm.fit(values)
    grid = np.linspace(values.min(), values.max(), 200)
    plt.plot(grid, norm.pdf(grid, mu, sigma), color="black")
    fig = plt.figure()
    res = stats.probplot(values, plot=plt)
    plt.show()
    
    image.png
    # Normality check (TotalBsmtSF): density histogram with a fitted normal
    # curve, followed by a normal probability plot on a new figure.
    # sns.distplot(..., fit=norm) is deprecated/removed in seaborn, so the
    # histogram uses histplot and the normal fit is drawn explicitly.
    values = train["TotalBsmtSF"]
    sns.histplot(values, kde=True, stat="density")
    mu, sigma = norm.fit(values)
    grid = np.linspace(values.min(), values.max(), 200)
    plt.plot(grid, norm.pdf(grid, mu, sigma), color="black")
    fig = plt.figure()
    res = stats.probplot(values, plot=plt)
    plt.show()
    
    image.png
    # Flag column: 1 when TotalBsmtSF > 0, otherwise 0
    train['HasBsmt'] = 0
    train.loc[train['TotalBsmtSF']>0,'HasBsmt'] = 1

    # Log-transform only the rows flagged with HasBsmt == 1.
    # The log is computed on the selected rows alone so that log(0) is never
    # evaluated, and the column is cast to float first because the
    # transformed values are not integers.
    has_bsmt = train['HasBsmt'] == 1
    train['TotalBsmtSF'] = train['TotalBsmtSF'].astype(float)
    train.loc[has_bsmt, 'TotalBsmtSF'] = np.log(train.loc[has_bsmt, 'TotalBsmtSF'])

    # Select by the flag rather than by TotalBsmtSF > 0: after the transform a
    # log value can be 0 (original value 1) and would be dropped by mistake.
    data = train.loc[has_bsmt, 'TotalBsmtSF']
    # sns.distplot(..., fit=norm) is deprecated/removed in seaborn, so the
    # histogram uses histplot and the normal fit is drawn explicitly.
    sns.histplot(data, kde=True, stat="density")
    mu, sigma = norm.fit(data)
    grid = np.linspace(data.min(), data.max(), 200)
    plt.plot(grid, norm.pdf(grid, mu, sigma), color="black")
    fig = plt.figure()
    res = stats.probplot(data, plot=plt)
    plt.show()
    
    image.png

    同方差性

    # Relationship between 'SalePrice' and 'GrLivArea'
    x_values, y_values = train['GrLivArea'], train['SalePrice']
    plt.scatter(x_values, y_values)
    plt.show()
    
    image.png
    # Relationship between 'SalePrice' and 'TotalBsmtSF', rows with a basement only.
    # TotalBsmtSF holds log-transformed values at this point, so `> 0` is no
    # longer a reliable "has basement" test (log(1) == 0); use the HasBsmt flag.
    data = train[train['HasBsmt']==1]

    plt.scatter(data['TotalBsmtSF'],data['SalePrice'])
    plt.show()
    
    image.png

    生成哑变量

    # Generate dummy variables: pd.get_dummies replaces the categorical
    # (object/string/category) columns with indicator columns
    train = pd.get_dummies(train)
    train
    
    image.png

    相关文章

      网友评论

          本文标题:02房价统计

          本文链接:https://www.haomeiwen.com/subject/ewohqdtx.html