import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import mean_squared_error
warnings.filterwarnings("ignore")
# Load the housing data and check for missing values
data = pd.read_csv('housing.csv')
print('columns with missing values:', data.isnull().any().sum())
# # Scatter-matrix of all features (uncomment to inspect the raw distributions)
# grr = pd.plotting.scatter_matrix(data, alpha=0.7, figsize=(10,10), diagonal='kde')
# plt.show()
corr = data.corr()
print(corr)
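# Added for reference: correlation of every feature with the target MEDV, sorted,
# as a quick sanity check before the SelectKBest step below.
print(corr['MEDV'].sort_values(ascending=False))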
# Feature selection using the correlation-coefficient method
# Define the feature matrix x and the target y
x = data[['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
          'PTRATIO', 'B', 'LSTAT']]
y = data[['MEDV']]
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
# Select the 5 features most strongly related to the Boston house price
selector = SelectKBest(f_regression, k=5)   # renamed so the SelectKBest class is not shadowed
bestFeature = selector.fit_transform(x, y)
print(selector.get_support())
print("The 5 features most strongly related to the Boston house price:",
      x.columns[selector.get_support()])
# With k=5 the selected features (used below) are ['INDUS', 'RM', 'TAX', 'PTRATIO', 'LSTAT']
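# Added for illustration: the univariate F statistic that f_regression assigns to
# each feature; larger values indicate a stronger linear relationship with MEDV.
for name, score in zip(x.columns, selector.scores_):
    print('%-8s F = %.1f' % (name, score))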
# Scatter-matrix of the five selected features
features = data[['INDUS', 'RM', 'TAX', 'PTRATIO', 'LSTAT']].copy()  # .copy() avoids a SettingWithCopyWarning below
pd.plotting.scatter_matrix(features, alpha=0.7, figsize=(6,6), diagonal='hist')
plt.show()
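# Note (added): MinMaxScaler rescales each column independently to the [0, 1] range:
#     x_scaled = (x - x_min) / (x_max - x_min)
# which keeps the five features on a comparable scale for the plots and the model.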
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
for feature in features.columns:
    features['标准化' + feature] = scaler.fit_transform(features[[feature]]).ravel()
# Scatter-plot the normalized features
# Use a CJK-capable font so the Chinese '标准化' column labels render in the plot
font = {
    'family': 'SimHei'
}
matplotlib.rc('font', **font)
pd.plotting.scatter_matrix(features[['标准化INDUS', '标准化RM', '标准化TAX', '标准化PTRATIO', '标准化LSTAT']], alpha=0.7, figsize=(10,10), diagonal='hist')
plt.show()
# Split the data into a training set and a test set
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
x_train, x_test, y_train, y_test = train_test_split(
    features[['标准化INDUS', '标准化RM', '标准化TAX', '标准化PTRATIO', '标准化LSTAT']],
    y, test_size=0.3, random_state=33)
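# Added: quick sanity check of the 70/30 split sizes.
print('train samples: %d, test samples: %d' % (len(x_train), len(x_test)))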
# Predict house prices with linear regression
from sklearn import linear_model, metrics
lr = linear_model.LinearRegression()
lr.fit(x_train, y_train)
lr_predict = cross_val_predict(lr, x_train, y_train, cv=5)   # out-of-fold predictions on the training set
lr_score = cross_val_score(lr, x_train, y_train, cv=5)       # per-fold R^2 scores on the training set
lr_meanscore = lr_score.mean()
print('mean 5-fold cross-validation R^2 (training set):')
print('%.2f%%' % (lr_meanscore * 100))
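# Added for comparison: R^2 on the held-out test set
# (LinearRegression.score returns the coefficient of determination).
print('test R^2: %.2f' % lr.score(x_test, y_test))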
y_Predict = lr.predict(x_test)   # predictions on the held-out test set
model_mse = metrics.mean_squared_error(y_test, y_Predict)
print('MSE on the test set:')
print('%.2f' % model_mse)
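# Added: RMSE is in the same units as MEDV (reported in $1000s in the classic
# Boston housing data), which is often easier to interpret than MSE.
print('rmse: %.2f' % (model_mse ** 0.5))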