数据EDA
# 导入库
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("fivethirtyeight")
from scipy.stats import norm
from scipy import stats
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Load the training and test sets from CSV files in the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# Preview the first rows of the training set.
train.head()
data:image/s3,"s3://crabby-images/21836/218365f3076ddf252a70d886e0d73b48fe85dc52" alt=""
image.png
# Dataset dimensions as a (rows, columns) tuple.
train.shape
data:image/s3,"s3://crabby-images/c95ba/c95ba9d10b254e083a006fb9eab34ff383b5a768" alt=""
image.png
# Number of missing values per column, sorted ascending (worst columns last).
train.isnull().sum().sort_values()
data:image/s3,"s3://crabby-images/b0876/b0876f0587ecfd478a4e28119960a20e8126a679" alt=""
image.png
# Descriptive statistics for the training set.
train.describe()
data:image/s3,"s3://crabby-images/b3bc1/b3bc19b7cf5eb1ca893c1a84948bd6c9cbe534bb" alt=""
image.png
销售价格SalePrice分析
# Summary statistics of the target column SalePrice.
train["SalePrice"].describe()
data:image/s3,"s3://crabby-images/180d9/180d970671a57d7c815775a86446956af20bf245" alt=""
image.png
# Distribution of SalePrice: density-scaled histogram with a KDE overlay.
# sns.distplot is deprecated and scheduled for removal; histplot
# (seaborn >= 0.11) with kde=True and stat="density" draws the same figure.
sns.histplot(train['SalePrice'], kde=True, stat="density")
plt.show()
data:image/s3,"s3://crabby-images/19f8a/19f8a78d6f7759cb2ed8ed3117d3b496e700f15f" alt=""
image.png
# Skewness and kurtosis of the SalePrice distribution.
skewness = train['SalePrice'].skew()
kurtosis = train['SalePrice'].kurt()
print(f"Skewness(偏度):{skewness:f}")
print(f"Kurtosis(峰度):{kurtosis:f}")
data:image/s3,"s3://crabby-images/98509/98509a1cc0c6c7dfdb8d9cea67d110329936a102" alt=""
# image.png
# Relationship between SalePrice and numeric features:
# start with the SalePrice / GrLivArea pair.
data = train[["SalePrice","GrLivArea"]]
data.head()
data:image/s3,"s3://crabby-images/9f443/9f4439a047da05baa38deee9de784277a043e598" alt=""
image.png
# Scatter plot of SalePrice against GrLivArea on a 12x6 figure.
plt.figure(num=1, figsize=(12, 6))
sns.scatterplot(data=data, x="GrLivArea", y="SalePrice")
plt.show()
data:image/s3,"s3://crabby-images/40fbc/40fbc8b290961aa2c665c0bab1d8fb08dc2c250b" alt=""
image.png
# Scatter plot of SalePrice against TotalBsmtSF on a 12x6 figure.
data = train[["SalePrice", "TotalBsmtSF"]]
plt.figure(num=1, figsize=(12, 6))
sns.scatterplot(data=data, x="TotalBsmtSF", y="SalePrice")
plt.show()
data:image/s3,"s3://crabby-images/c265b/c265b24a59ef5996bc4336dabf7a1539c7b29c93" alt=""
image.png
# Relationship between price and categorical features:
# frequency of each OverallQual level.
train["OverallQual"].value_counts()
data:image/s3,"s3://crabby-images/4c550/4c550fcadf0d64a835a596e2f13123d11ced0a56" alt=""
image.png
# Box plot of SalePrice per OverallQual level, y-axis fixed to 0..800000.
data = train[["SalePrice", "OverallQual"]]
f, ax = plt.subplots(1, figsize=(12, 6))
fig = sns.boxplot(data=data, x="OverallQual", y="SalePrice")
# sns.boxplot returns the Axes it drew on, so the limits are set on it.
fig.set_ylim(0, 800000)
plt.show()
data:image/s3,"s3://crabby-images/169f7/169f781e0afdfc662f1de54ae7e122be8e1ea9c6" alt=""
image.png
# Box plot of SalePrice per YearBuilt value, y-axis fixed to 0..800000.
data = train[["SalePrice", "YearBuilt"]]
f, ax = plt.subplots(1, figsize=(16, 8))
fig = sns.boxplot(data=data, x="YearBuilt", y="SalePrice")
# sns.boxplot returns the Axes it drew on, so the limits are set on it.
fig.set_ylim(0, 800000)
plt.show()
data:image/s3,"s3://crabby-images/cc5a9/cc5a97aa4269fd13528ec9fa6420d1b451fa8f25" alt=""
image.png
相关性分析
# Overall correlation matrix between the numeric columns.
# DataFrame.corr() no longer silently skips non-numeric columns in
# pandas >= 2.0 (it raises on strings), so the numeric columns are selected
# explicitly; this form also works on older pandas versions.
corrmat = train.select_dtypes(include=[np.number]).corr()
corrmat.head()
data:image/s3,"s3://crabby-images/64639/6463956d6d7bea83723c0757ad0f62c6bf56dccf" alt=""
image.png
# Heat map of the full correlation matrix; colours saturate at 0.8.
f, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(corrmat, vmax=0.8, square=True, ax=ax)
plt.show()
data:image/s3,"s3://crabby-images/e12bb/e12bb9ac32381a3af32b7f19b782a6fa6dceac9f" alt=""
image.png
# Zoomed-in correlation matrix: names of the k columns whose correlation
# with SalePrice is largest (signed values, not absolute values).
k = 10
corrmat.nlargest(k,"SalePrice")["SalePrice"].index
data:image/s3,"s3://crabby-images/8135b/8135bcaaf752c061b8ff0f93d8b359e37d41af50" alt=""
image.png
# Rows of the correlation matrix for the k columns with the largest
# correlation with SalePrice.
k = 10
corrmat.nlargest(k,"SalePrice")
data:image/s3,"s3://crabby-images/7deef/7deef5aeefa98abada2f2a5b19a77a3ac4d3dc1c" alt=""
image.png
# Keep the names of those k columns for the zoomed heat map.
cols = corrmat.nlargest(k,"SalePrice").index
cols
data:image/s3,"s3://crabby-images/305e5/305e57325e1663f4aa9e3c2192dd775bd3fd6a89" alt=""
image.png
# Correlation coefficients between the selected columns; rowvar=False
# treats each column of the array as one variable.
cm = np.corrcoef(train[cols].to_numpy(), rowvar=False)
cm[:3]
data:image/s3,"s3://crabby-images/cdbe2/cdbe2d917c1f16ca88c597c77d8c017fca809bc2" alt=""
image.png
# Annotated heat map of the top-k correlation matrix, labelled with the
# column names on both axes.
sns.set(font_scale=1.25)
tick_labels = cols.values
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)
plt.show()
data:image/s3,"s3://crabby-images/c4b61/c4b612d3bb507a7cf3a38f4f8dd18b830c5b0f03" alt=""
image.png
# Scatter-plot matrix of SalePrice and its most correlated features.
cols = ['SalePrice','OverallQual','GrLivArea','GarageCars','TotalBsmtSF','FullBath','YearBuilt']
# `size` was renamed to `height` in seaborn 0.9; the old keyword is deprecated.
sns.pairplot(train[cols], height=2.5)
plt.show()
data:image/s3,"s3://crabby-images/39b29/39b29b310c77543e3f3822668260189630e9e05b" alt=""
image.png
缺失值处理
# Missing-value overview: count of missing values per column, largest first.
total = train.isnull().sum().sort_values(ascending=False)
total.head()
data:image/s3,"s3://crabby-images/816f3/816f30a98752ddc8a89d942a39546c6fcf48553e" alt=""
image.png
# Fraction of missing values per column (between 0 and 1), largest first.
# The mean of the boolean mask equals missing count / row count.
missing_fraction = train.isnull().mean()
percent = missing_fraction.sort_values(ascending=False)
percent.head()
data:image/s3,"s3://crabby-images/dc47a/dc47a2f4a97ae011470acc7685659218714daab8" alt=""
image.png
# Combine counts and fractions side by side into one table with the
# columns "Total" and "Percent".
missing_data = pd.concat([total,percent],axis=1,keys=["Total","Percent"])
missing_data.head()
data:image/s3,"s3://crabby-images/e2356/e2356c215da93c0f2c8bbc60663c80abaaa305e3" alt=""
image.png
# Handling missing values: names of the columns that have more than one
# missing value.
missing_data[missing_data["Total"]>1].index
data:image/s3,"s3://crabby-images/8ffe5/8ffe5e1b4f48c6b2d67abd69e093296e23914e8a" alt=""
image.png
# Drop every column with more than one missing value. The axis must be a
# keyword: the positional form drop(labels, 1) was removed in pandas 2.0.
train = train.drop(missing_data[missing_data["Total"] > 1].index, axis=1)
# Drop the rows in which Electrical is missing.
train = train.drop(train.loc[train["Electrical"].isnull()].index)
# Largest remaining per-column missing count.
train.isnull().sum().max()
data:image/s3,"s3://crabby-images/d19a8/d19a8349c18af24e9d05402efb2fc04a23050fa6" alt=""
image.png
离群点(outliers)
# Look for outliers: standardise SalePrice to zero mean and unit variance.
# Selecting with a list of columns yields the 2-D (n_rows, 1) array that
# StandardScaler expects; indexing a Series with [:, np.newaxis] is no
# longer supported in pandas 2.0.
saleprice_scaled = StandardScaler().fit_transform(train[["SalePrice"]].values)
saleprice_scaled[:5]
data:image/s3,"s3://crabby-images/179ea/179ea7a55a4eadaef50a5d3abe5943133621ee02" alt=""
image.png
# Order the standardised prices once, then take the ten smallest and the
# ten largest values.
sorted_scaled = saleprice_scaled[saleprice_scaled[:, 0].argsort()]
low_range = sorted_scaled[:10]
high_range = sorted_scaled[-10:]
print(low_range)
print('---------------')
print(high_range)
data:image/s3,"s3://crabby-images/d06a2/d06a20f3b03418472150615e10474ec8fb9fe161" alt=""
image.png
# Outlier check 1: SalePrice against GrLivArea, y-axis limited to 0..800000.
data = train[["SalePrice", "GrLivArea"]]
data.plot(kind="scatter", x="GrLivArea", y="SalePrice", ylim=(0, 800000))
plt.show()
data:image/s3,"s3://crabby-images/abee8/abee8c7c931b3087cd2d721c4980967ab2399804" alt=""
image.png
# Removing outliers: show the two rows with the largest GrLivArea.
train.sort_values(by='GrLivArea', ascending=False).head(2)
data:image/s3,"s3://crabby-images/85767/857670b0f3f8635ddd66a6c837a1ef95f8014548" alt=""
image.png
# Remove the rows whose Id is 1299 or 524.
outlier_ids = [1299, 524]
train = train.drop(train[train['Id'].isin(outlier_ids)].index)
train.head()
data:image/s3,"s3://crabby-images/324e8/324e8d39f84fb287991e29fcd4e5458033b67409" alt=""
image.png
# Outlier check 2: SalePrice against TotalBsmtSF, y-axis limited to 0..800000.
data = train[["SalePrice", "TotalBsmtSF"]]
data.plot(kind="scatter", x="TotalBsmtSF", y="SalePrice", ylim=(0, 800000))
plt.show()
data:image/s3,"s3://crabby-images/9dadb/9dadb47c4b93329de64371ac08de3f625f0e1b42" alt=""
image.png
深入理解SalePrice
# Normality check for SalePrice: density histogram + KDE with a fitted
# normal curve, followed by a normal probability plot.
# sns.distplot(fit=norm) is deprecated, so the figure is rebuilt with
# histplot (seaborn >= 0.11) and an explicit normal fit.
hist_ax = sns.histplot(train["SalePrice"], kde=True, stat="density")
mu, sigma = norm.fit(train["SalePrice"])
grid = np.linspace(*hist_ax.get_xlim(), 200)
hist_ax.plot(grid, norm.pdf(grid, mu, sigma), color="black")
fig = plt.figure()
res = stats.probplot(train["SalePrice"], plot=plt)
data:image/s3,"s3://crabby-images/a0a2e/a0a2e0aafd5e1e840baf82f6464551c0ecbd52a9" alt=""
image.png
# Log-transform SalePrice in place, then repeat the normality check.
train["SalePrice"] = np.log(train["SalePrice"])
# sns.distplot(fit=norm) is deprecated, so the figure is rebuilt with
# histplot (seaborn >= 0.11) and an explicit normal fit.
hist_ax = sns.histplot(train["SalePrice"], kde=True, stat="density")
mu, sigma = norm.fit(train["SalePrice"])
grid = np.linspace(*hist_ax.get_xlim(), 200)
hist_ax.plot(grid, norm.pdf(grid, mu, sigma), color="black")
fig = plt.figure()
res = stats.probplot(train["SalePrice"], plot=plt)
data:image/s3,"s3://crabby-images/aa38d/aa38d6859a92f4dfe39cdf371e1a1248101690f4" alt=""
image.png
# Normality check for GrLivArea: density histogram + KDE with a fitted
# normal curve, followed by a normal probability plot.
# sns.distplot(fit=norm) is deprecated, so the figure is rebuilt with
# histplot (seaborn >= 0.11) and an explicit normal fit.
hist_ax = sns.histplot(train["GrLivArea"], kde=True, stat="density")
mu, sigma = norm.fit(train["GrLivArea"])
grid = np.linspace(*hist_ax.get_xlim(), 200)
hist_ax.plot(grid, norm.pdf(grid, mu, sigma), color="black")
fig = plt.figure()
res = stats.probplot(train["GrLivArea"], plot=plt)
data:image/s3,"s3://crabby-images/3b4e4/3b4e429baba8499e34511cb1dcffeb3b10e16fb0" alt=""
image.png
# Log-transform GrLivArea in place, then repeat the normality check.
train["GrLivArea"] = np.log(train["GrLivArea"])
# sns.distplot(fit=norm) is deprecated, so the figure is rebuilt with
# histplot (seaborn >= 0.11) and an explicit normal fit.
hist_ax = sns.histplot(train["GrLivArea"], kde=True, stat="density")
mu, sigma = norm.fit(train["GrLivArea"])
grid = np.linspace(*hist_ax.get_xlim(), 200)
hist_ax.plot(grid, norm.pdf(grid, mu, sigma), color="black")
fig = plt.figure()
res = stats.probplot(train["GrLivArea"], plot=plt)
data:image/s3,"s3://crabby-images/95783/957834c3885619acc11309bd588d0b4a67da1267" alt=""
image.png
# Normality check for TotalBsmtSF: density histogram + KDE with a fitted
# normal curve, followed by a normal probability plot.
# sns.distplot(fit=norm) is deprecated, so the figure is rebuilt with
# histplot (seaborn >= 0.11) and an explicit normal fit.
hist_ax = sns.histplot(train["TotalBsmtSF"], kde=True, stat="density")
mu, sigma = norm.fit(train["TotalBsmtSF"])
grid = np.linspace(*hist_ax.get_xlim(), 200)
hist_ax.plot(grid, norm.pdf(grid, mu, sigma), color="black")
fig = plt.figure()
res = stats.probplot(train["TotalBsmtSF"], plot=plt)
data:image/s3,"s3://crabby-images/4a138/4a13873f520c3c0cafb5417f93175815ee1e3f9c" alt=""
image.png
# Flag houses that have a basement: HasBsmt is 1 when TotalBsmtSF > 0, else 0.
train['HasBsmt'] = 0
train.loc[train['TotalBsmtSF'] > 0, 'HasBsmt'] = 1
has_bsmt = train['HasBsmt'] == 1
# Cast first so the logged (float) values are not written into an integer
# column, then take the log of the basement rows only. Logging the whole
# column would evaluate log(0) = -inf for houses without a basement.
train['TotalBsmtSF'] = train['TotalBsmtSF'].astype(float)
train.loc[has_bsmt, 'TotalBsmtSF'] = np.log(train.loc[has_bsmt, 'TotalBsmtSF'])
# Select by the flag rather than by "> 0": a logged value can legitimately
# be 0 and must still count as a basement row.
data = train.loc[has_bsmt, 'TotalBsmtSF']
# sns.distplot(fit=norm) is deprecated, so the figure is rebuilt with
# histplot (seaborn >= 0.11) and an explicit normal fit.
hist_ax = sns.histplot(data, kde=True, stat="density")
mu, sigma = norm.fit(data)
grid = np.linspace(*hist_ax.get_xlim(), 200)
hist_ax.plot(grid, norm.pdf(grid, mu, sigma), color="black")
fig = plt.figure()
res = stats.probplot(data, plot=plt)
data:image/s3,"s3://crabby-images/e1c84/e1c847af5ac36491c5453eadbe10a5460c0adc00" alt=""
image.png
同方差性
# Homoscedasticity check: SalePrice against GrLivArea.
plt.scatter(x=train['GrLivArea'], y=train['SalePrice'])
plt.show()
data:image/s3,"s3://crabby-images/a7133/a7133ca70297c64d8f9e8a1883e763959c2d29f2" alt=""
image.png
# Homoscedasticity check: SalePrice against TotalBsmtSF, restricted to rows
# whose TotalBsmtSF is greater than 0.
data = train.loc[train['TotalBsmtSF'] > 0]
plt.scatter(x=data['TotalBsmtSF'], y=data['SalePrice'])
plt.show()
data:image/s3,"s3://crabby-images/f58bd/f58bd4df1c4355bedfa19685f92bec565282e18c" alt=""
image.png
生成哑变量
# Replace train with its dummy-variable (indicator column) encoding produced
# by pd.get_dummies, then display the result.
train = pd.get_dummies(train)
train
data:image/s3,"s3://crabby-images/ed5e4/ed5e4b878ed3cd82a8480272ffbf9dcdcc0c7089" alt=""
image.png
网友评论