数据EDA
# 导入库
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("fivethirtyeight")
from scipy.stats import norm
from scipy import stats
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Load the training and test sets from CSV files in the working directory
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# Preview the first rows of the training frame
train.head()
image.png
# Dataset dimensions as (rows, columns)
train.shape
image.png
# Number of missing values per column, sorted in ascending order
train.isna().sum().sort_values()
image.png
# Descriptive statistics of the training frame
train.describe()
image.png
销售价格SalePrice分析
# Descriptive statistics of the SalePrice column
train["SalePrice"].describe()
image.png
# Distribution plot of SalePrice
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm
# the installed version still provides it (histplot/displot are the successors)
sns.distplot(train['SalePrice'])
plt.show()
image.png
# Print the skewness and kurtosis of SalePrice
print("Skewness(偏度):%f"%train['SalePrice'].skew())
print("Kurtosis(峰度):%f"%train['SalePrice'].kurt())
# image.png
# Relationship between SalePrice and numeric columns:
# start with a two-column subset (SalePrice, GrLivArea) and preview it
data = train[["SalePrice","GrLivArea"]]
data.head()
image.png
# Scatter plot of SalePrice against GrLivArea on a 12x6 figure
plt.figure(num=1, figsize=(12, 6))
sns.scatterplot(data=data, x="GrLivArea", y="SalePrice")
plt.show()
image.png
# Scatter plot of SalePrice against TotalBsmtSF on a 12x6 figure
data = train[["SalePrice", "TotalBsmtSF"]]
plt.figure(num=1, figsize=(12, 6))
sns.scatterplot(data=data, x="TotalBsmtSF", y="SalePrice")
plt.show()
image.png
# Relationship between price and categorical columns:
# frequency of each OverallQual value
train["OverallQual"].value_counts()
image.png
# Box plot of SalePrice per OverallQual value, y-axis limited to 0..800000
data = train[["SalePrice", "OverallQual"]]
f, ax = plt.subplots(nrows=1, figsize=(12, 6))
fig = sns.boxplot(data=data, x="OverallQual", y="SalePrice")
fig.axis(ymin=0, ymax=800000)
plt.show()
image.png
# Box plot of SalePrice per YearBuilt value, y-axis limited to 0..800000
data = train[["SalePrice", "YearBuilt"]]
f, ax = plt.subplots(nrows=1, figsize=(16, 8))
fig = sns.boxplot(data=data, x="YearBuilt", y="SalePrice")
fig.axis(ymin=0, ymax=800000)
plt.show()
image.png
相关性分析
# Overall correlation matrix.
# Restrict to numeric/boolean columns before calling corr(): since pandas 2.0
# DataFrame.corr() no longer skips non-numeric columns silently and raises on
# string columns instead.
corrmat = train.select_dtypes(include=["number", "bool"]).corr()
# Preview the first rows of the matrix
corrmat.head()
image.png
# Heat map of the full correlation matrix; colour scale capped at 0.8
f, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(corrmat, square=True, vmax=0.8)
plt.show()
image.png
# Zoomed correlation matrix: labels of the k rows with the largest
# values in the "SalePrice" column of corrmat
k = 10
corrmat.nlargest(k,"SalePrice")["SalePrice"].index
image.png
# The k rows of corrmat with the largest "SalePrice" values
k = 10
corrmat.nlargest(k,"SalePrice")
image.png
# Keep the labels of those k rows as the column selection used below
cols = corrmat.nlargest(k,"SalePrice").index
cols
image.png
# Correlation coefficients between the selected columns; the values are
# transposed so that each column of train[cols] becomes one row passed to np.corrcoef
cm = np.corrcoef(train[cols].values.T)
# Preview the first three rows
cm[:3]
image.png
# Annotated heat map of the top-k correlation coefficients,
# labelled on both axes with the selected column names
sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    annot=True,
    annot_kws={'size': 10},
    fmt='.2f',
    square=True,
    cbar=True,
    xticklabels=cols.values,
    yticklabels=cols.values,
)
plt.show()
image.png
# Pairwise scatter plots of the selected variables
cols = ['SalePrice','OverallQual','GrLivArea','GarageCars','TotalBsmtSF','FullBath','YearBuilt']
# `height` replaces the former `size` keyword, which current seaborn
# releases reject with a TypeError
sns.pairplot(train[cols], height=2.5)
plt.show()
image.png
缺失值处理
# Missing-value share: number of missing values per column, largest first
total = train.isna().sum().sort_values(ascending=False)
total.head()
image.png
# Fraction of missing values per column (missing count / row count), largest first.
# The mean of the boolean mask equals its sum divided by its length.
percent = train.isna().mean().sort_values(ascending=False)
percent.head()
image.png
# Combine counts and fractions side by side into one missing-value overview
# with columns "Total" and "Percent"
missing_data = pd.concat([total,percent],axis=1,keys=["Total","Percent"])
missing_data.head()
image.png
# Handling missing values: labels of the columns with more than one missing value
missing_data[missing_data["Total"]>1].index
image.png
# Drop every column that has more than one missing value.
# `axis` must be passed by keyword: the positional form was removed in pandas 2.0.
train = train.drop(missing_data[missing_data["Total"] > 1].index, axis=1)
# Drop the rows in which "Electrical" is missing
train = train.drop(train.loc[train["Electrical"].isnull()].index)
# Largest remaining per-column missing count
train.isnull().sum().max()
image.png
离群点(outliers)
# Outlier search: standardise SalePrice with StandardScaler.
# The scaler needs a 2-D input of shape (n_samples, 1). Reshape the underlying
# array instead of indexing the Series with [:, np.newaxis], which pandas no
# longer supports.
saleprice_scaled = StandardScaler().fit_transform(train["SalePrice"].values.reshape(-1, 1))
# Preview the first five scaled values
saleprice_scaled[:5]
image.png
# Ten smallest and ten largest standardised prices.
# saleprice_scaled has a single column, so sorting along axis 0 orders the rows.
low_range = np.sort(saleprice_scaled, axis=0)[:10]
high_range = np.sort(saleprice_scaled, axis=0)[-10:]
print(low_range)
print('---------------')
print(high_range)
image.png
# Analysis 1: SalePrice against GrLivArea, y-axis limited to 0..800000
data = train[["SalePrice", "GrLivArea"]]
data.plot(kind="scatter", x="GrLivArea", y="SalePrice", ylim=(0, 800000))
plt.show()
image.png
# Removing outliers: inspect the two rows with the largest GrLivArea
train.sort_values(by='GrLivArea', ascending=False).head(2)
image.png
# Remove the rows whose Id is 1299 or 524, then preview the result
train = train.drop(train[train['Id'].isin([1299, 524])].index)
train.head()
image.png
# Analysis 2: SalePrice against TotalBsmtSF, y-axis limited to 0..800000
data = train[["SalePrice", "TotalBsmtSF"]]
data.plot(kind="scatter", x="TotalBsmtSF", y="SalePrice", ylim=(0, 800000))
plt.show()
image.png
深入理解SalePrice
# Normality check for SalePrice: distribution plot with a fitted normal
# curve, then a probability plot drawn on a new figure
sns.distplot(train["SalePrice"],fit=norm)
fig = plt.figure()
res = stats.probplot(train["SalePrice"],plot=plt)
image.png
# Log transform: replace SalePrice in place by its natural logarithm,
# then repeat the distribution plot and the probability plot
train["SalePrice"] = np.log(train["SalePrice"])
sns.distplot(train["SalePrice"],fit=norm)
fig = plt.figure()
res = stats.probplot(train["SalePrice"],plot=plt)
image.png
# Normality check for GrLivArea: distribution plot with a fitted normal
# curve, then a probability plot drawn on a new figure
sns.distplot(train["GrLivArea"],fit=norm)
fig = plt.figure()
res = stats.probplot(train["GrLivArea"],plot=plt)
image.png
# Apply the log transform: replace GrLivArea in place by its natural logarithm,
# then repeat the distribution plot and the probability plot
train["GrLivArea"] = np.log(train["GrLivArea"])
sns.distplot(train["GrLivArea"],fit=norm)
fig = plt.figure()
res = stats.probplot(train["GrLivArea"],plot=plt)
image.png
# Normality check for TotalBsmtSF: distribution plot with a fitted normal
# curve, then a probability plot drawn on a new figure
sns.distplot(train["TotalBsmtSF"],fit=norm)
fig = plt.figure()
res = stats.probplot(train["TotalBsmtSF"],plot=plt)
image.png
# Flag column: HasBsmt is 1 where TotalBsmtSF > 0 and 0 otherwise
train['HasBsmt'] = 0
train.loc[train['TotalBsmtSF'] > 0, 'HasBsmt'] = 1
has_bsmt = train['HasBsmt'] == 1
# Cast to float before writing log values: if the column was loaded with an
# integer dtype, assigning floats into it is an incompatible-dtype assignment
train['TotalBsmtSF'] = train['TotalBsmtSF'].astype(float)
# Log-transform only the flagged rows. Taking the log of the whole column
# would also evaluate log(0) = -inf for the unflagged rows.
train.loc[has_bsmt, 'TotalBsmtSF'] = np.log(train.loc[has_bsmt, 'TotalBsmtSF'])
# Select the transformed values through the flag rather than with `> 0`:
# a transformed value can legitimately be 0 (original value 1) and must be kept
data = train.loc[has_bsmt, 'TotalBsmtSF']
sns.distplot(data, fit=norm)
fig = plt.figure()
res = stats.probplot(data, plot=plt)
image.png
同方差性
# Relationship between 'SalePrice' and 'GrLivArea'
plt.scatter(x=train['GrLivArea'], y=train['SalePrice'])
plt.show()
image.png
# Relationship between 'SalePrice' and 'TotalBsmtSF', restricted to the rows
# flagged by HasBsmt. At this point TotalBsmtSF mixes log-transformed values
# with untouched zeros, so a `> 0` filter would also discard flagged rows
# whose transformed value is exactly 0.
data = train[train['HasBsmt'] == 1]
plt.scatter(data['TotalBsmtSF'], data['SalePrice'])
plt.show()
image.png
生成哑变量
# Replace train by its dummy-encoded version (pd.get_dummies) and display it
train = pd.get_dummies(train)
train
image.png
网友评论