美文网首页
【线性回归】预测Boston房价

【线性回归】预测Boston房价

作者: 唯师默蓝 | 来源:发表于2019-04-12 10:11 被阅读0次
    import pandas as pd
    import matplotlib.pyplot as plt
    import matplotlib
    # train_test_split lives in sklearn.model_selection; the old
    # sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
    # removed in 0.20, so importing from it raises ImportError on current
    # releases.
    from sklearn.model_selection import train_test_split
    import warnings

    from sklearn.metrics import mean_squared_error

    # Suppress every warning for the remainder of the script.
    warnings.filterwarnings("ignore")
    
    
    # Load the housing table. The context manager closes the file handle as
    # soon as pandas has finished reading it.
    with open('housing.csv') as f_housing:
        data = pd.read_csv(f_housing)

    # Number of columns that contain at least one missing value. A bare
    # expression is only echoed in an interactive session, so print it to make
    # the check visible when the file is run as a script.
    print(data.isnull().any().sum())
    
    # Optional: scatter matrix of every column to inspect the distributions.
    # grr = pd.plotting.scatter_matrix(data, alpha=0.7, figsize=(10,10), diagonal='kde')
    # plt.show()

    # Pairwise correlation between the columns of the data set.
    corr = data.corr()
    print(corr)


    # Feature selection by correlation: x holds the candidate predictor
    # columns, y the MEDV column (kept as a one-column DataFrame).
    # NOTE(review): 'TAS' has to match the CSV header exactly; this column is
    # usually spelled 'TAX' in the Boston housing data -- confirm against
    # housing.csv.
    predictor_columns = [
        'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAS',
        'PTRATIO', 'B', 'LSTAT',
    ]
    target_columns = ['MEDV']
    x = data[predictor_columns]
    y = data[target_columns]
    
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_regression

    # Keep the 5 predictors with the highest univariate F-score against the
    # target. The instance gets its own name so that the SelectKBest class is
    # not overwritten by an instance of itself.
    selector = SelectKBest(f_regression, k=5)
    # f_regression expects a 1-D target, so flatten the one-column frame.
    bestFeature = selector.fit_transform(x, y.values.ravel())
    support_mask = selector.get_support()
    print(support_mask)

    # Print the names of the selected columns (five of them, since k=5).
    # The plotting step of this script works with
    # ['INDUS', 'RM', 'TAS', 'PTRATIO', 'LSTAT'] -- presumably this output.
    print("得到波士顿房价相关性最强的5个因素:", x.columns[support_mask])
    # Scatter matrix of the five selected features.
    # .copy() yields an independent frame, so the normalized columns added
    # below are not written into a slice of `data`.
    features = data[['INDUS', 'RM', 'TAS', 'PTRATIO', 'LSTAT']].copy()
    pd.plotting.scatter_matrix(features, alpha=0.7, figsize=(6,6), diagonal='hist')
    plt.show()


    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    # NOTE(review): the scaler is fitted on the full data set before the
    # train/test split, so test rows influence the scaling; consider fitting
    # on the training rows only.
    # Snapshot the column names first, because the loop adds columns to
    # `features` while it runs.
    for feature in list(features.columns):
        # Each column is rescaled on its own and stored next to the original
        # under the '标准化' ("normalized") prefix; ravel() turns the (n, 1)
        # result into a plain 1-D column.
        features['标准化'+feature] = scaler.fit_transform(features[[feature]]).ravel()
    
    # Scatter matrix of the normalized columns, to inspect the data after
    # scaling. SimHei is set as the font family first (presumably so that the
    # Chinese column labels render in the figure).
    font = {'family': 'SimHei'}
    matplotlib.rc('font', **font)
    scaled_columns = ['标准化INDUS', '标准化RM', '标准化TAS', '标准化PTRATIO', '标准化LSTAT']
    scaled_features = features[scaled_columns]
    pd.plotting.scatter_matrix(
        scaled_features,
        alpha=0.7,
        figsize=(10, 10),
        diagonal='hist',
    )
    plt.show()
    
    # Split the normalized features and the target into a training set and a
    # test set (30% of the rows go to the test set, fixed seed 33).
    from sklearn.model_selection import cross_val_predict
    from sklearn.model_selection import cross_val_score

    model_inputs = features[['标准化INDUS', '标准化RM', '标准化TAS', '标准化PTRATIO', '标准化LSTAT']]
    x_train, x_test, y_train, y_test = train_test_split(
        model_inputs,
        y,
        test_size=0.3,
        random_state=33,
    )
    
    # Predict the house price with ordinary least-squares linear regression.
    from sklearn import linear_model, metrics

    lr = linear_model.LinearRegression()
    # Fit on the training split; this fitted model is the one used for the
    # test-set prediction later in the script.
    lr.fit(x_train, y_train)

    # 5-fold cross-validation on the training split only. cross_val_score
    # falls back to the estimator's own score method when no scoring is given,
    # which is R^2 for LinearRegression.
    # The earlier cross_val_predict call was dropped: its result was never
    # used and it repeated the whole cross-validation.
    lr_score = cross_val_score(lr, x_train, y_train, cv=5)
    lr_meanscore = lr_score.mean()

    # This is a cross-validation estimate, not a score on the test split, so
    # the label says so.
    print('cross-validation score (mean R^2):')
    print('%.2f%%' % (lr_meanscore * 100))
    
    
    # Evaluate the fitted model on the held-out test split: predict, then
    # report the mean squared error against the true targets.
    y_Predict = lr.predict(x_test)
    model_mse = mean_squared_error(y_test, y_Predict)
    print('mse score:', '%.2f' % model_mse, sep='\n')
    

    相关文章

      网友评论

          本文标题:【线性回归】预测Boston房价

          本文链接:https://www.haomeiwen.com/subject/sjuqwqtx.html