美文网首页
2019-08-12

2019-08-12

作者: 发现一个喜悦的地方 | 来源:发表于2019-08-13 10:08 被阅读0次

    # Read the training set from the CSV file at `train_path`.
    # NOTE(review): neither `pd` nor `train_path` is defined in the visible code —
    # presumably set up earlier; confirm before running.
    train_data=pd.read_csv(train_path)

    # Inspect the column labels of the training frame.
    train_data.columns

    # Preview the first rows of the test frame.
    # NOTE(review): `test_data` is never assigned in the visible code — presumably
    # it is loaded the same way as `train_data`; confirm before running.
    test_data.head()


    import pandas_profiling as ppf  # exploratory data analysis (EDA)

    # Build a profiling report for the training frame.
    # NOTE(review): newer releases of this package are published under the name
    # `ydata_profiling` — confirm which one is installed.
    ppf.ProfileReport(train_data)


    # Summary statistics for the SalePrice column, followed by its
    # skewness and kurtosis.
    sale_price = train_data['SalePrice']
    sale_price.describe()

    skewness = sale_price.skew()
    kurtosis = sale_price.kurt()
    print('train_data_skew:%f' % skewness)
    print('train_data_kurt:%f' % kurtosis)


    import seaborn as sns

    # Plot the distribution of the SalePrice column.
    # NOTE(review): `distplot` is deprecated in recent seaborn releases — confirm
    # the installed version still provides it.
    sns.distplot(train_data['SalePrice'])


    import matplotlib.pyplot as plt

    # Scatter plot of SalePrice against GrLivArea to eyeball how the two
    # variables relate; the y axis is capped at 800000.
    var = 'GrLivArea'
    data = train_data[['SalePrice', var]]
    data.plot(kind='scatter', x=var, y='SalePrice', ylim=(0, 800000))


    # Box plot of SalePrice for each OverallQual value, used to spot outliers.
    var = 'OverallQual'
    data = train_data[['SalePrice', var]]

    f, ax = plt.subplots(figsize=(8, 6))
    fig = sns.boxplot(data=data, x=var, y="SalePrice")
    # `fig` is whatever sns.boxplot returned; clamp its y range to [0, 800000].
    fig.axis(ymin=0, ymax=800000);

    # Remove outliers: rows with GrLivArea above 4000 but SalePrice below 300000.
    large_area = train_data['GrLivArea'] > 4000
    low_price = train_data['SalePrice'] < 300000
    outlier_index = train_data[large_area & low_price].index
    train_data.drop(outlier_index, inplace=True)


    # Correlation heatmaps for the training frame.
    #
    # The frame is called `train_data` everywhere else in this script, so that
    # name is used here as well. The correlation is computed over the numeric
    # columns only, so the result does not depend on how the installed pandas
    # version treats non-numeric columns in DataFrame.corr().
    corrmat = train_data.select_dtypes(include=[np.number]).corr()

    # Heatmap of the full correlation matrix; colours saturate at 0.8.
    f, ax = plt.subplots(figsize=(12, 9))
    sns.heatmap(corrmat, vmax=.8, square=True);

    # Zoom in on the k columns most correlated with SalePrice.
    k = 10  # number of variables for heatmap
    cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index

    # np.corrcoef treats each row as one variable, hence the transpose.
    cm = np.corrcoef(train_data[cols].values.T)
    print('cm>>',cm)

    sns.set(font_scale=1.25)
    hm = sns.heatmap(
        cm,
        cbar=True,
        annot=True,
        square=True,
        fmt='.2f',
        annot_kws={'size': 10},
        yticklabels=cols.values,
        xticklabels=cols.values,
    )
    plt.show()


    数据清洗:对空值有三种处理方式——填充、删除或不处理。

    # Count the missing values in each column of `full`, then list only the
    # columns that have at least one, in ascending order of count.
    # NOTE(review): `full` is not defined in the visible code — presumably the
    # combined train/test frame; confirm.
    miss = full.isna().sum()
    miss.loc[miss > 0].sort_values()

    对 object、int、float 三类列分别填充空值:

            full[col].fillna('None',inplace=True)

            full['LotFrontage'].fillna(np.mean(full['LotFrontage']),inplace=True)

            

    相关文章

      网友评论

          本文标题:2019-08-12

          本文链接:https://www.haomeiwen.com/subject/xcryjctx.html