#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Predict house prices from housing.csv with a grid-searched decision-tree regressor.

Created on Tue Oct 29 10:39:29 2019
@author: liyili2
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
# Load the housing data and separate the target column (MEDV) from the predictors.
boston = pd.read_csv("/Users/liyili2/Downloads/datas/kaggle/housing.csv")
features = boston.drop(columns='MEDV')
price = boston['MEDV']
print("波士顿房价数据有{} 行 points with {} variables each.".format(*boston.shape))

# Basic descriptive statistics of the dependent variable (the price column).
# Each statistic is computed right before it is printed, in the listed order.
for label, statistic in (
    ("房价最小值是:", price.min),
    ("房价最大值是:", price.max),
    ("房价均值是:", price.mean),
    ("房价中位数是:", price.median),
    ("房价标准差是:", price.std),
):
    print(label, statistic())
# --- Modelling ---
def performance_metric(y, y_predict):
    """Return the coefficient of determination (R^2) between true and predicted values.

    Args:
        y: The true target values.
        y_predict: The predicted target values, in the same order as ``y``.

    Returns:
        The score computed by ``r2_score(y, y_predict)``.
    """
    return r2_score(y, y_predict)
# Sanity-check the metric on a small hand-written example.
score = performance_metric(
    y=[3, -0.5, 2, 7, 4.2],
    y_predict=[2.5, 0.0, 2.1, 7.8, 5.3],
)
print("Model has a coefficient of determination, R^2, of {:.3f}.".format(score))

# Hold out 20% of the rows for testing; random_state is the random seed value
# that makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    price,
    test_size=0.2,
    random_state=17,
)
print("Training and testing split was successful.")
def fit_model(X, y):
    """Grid-search the depth of a decision-tree regressor and return the best model.

    Candidate ``max_depth`` values 1 through 10 are compared with shuffle-split
    cross-validation, scored by ``performance_metric`` (R^2).

    Args:
        X: Training features.
        y: Training target values aligned with ``X``.

    Returns:
        The ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    # The first parameter of sklearn.model_selection.ShuffleSplit is the number
    # of re-shuffling iterations (n_splits), not the number of samples. Passing
    # X.shape[0] here would run one cross-validation split per training row.
    cv_sets = ShuffleSplit(n_splits=10, random_state=0)
    regressor = DecisionTreeRegressor()
    params = {'max_depth': list(range(1, 11))}
    scoring_fnc = make_scorer(performance_metric)
    grid = GridSearchCV(regressor, params, scoring=scoring_fnc, cv=cv_sets)
    grid.fit(X, y)
    return grid.best_estimator_
# Fit the model on the training split and report the depth chosen by the grid search.
reg = fit_model(X_train, y_train)
print("Parameter 'max_depth' is {} for the optimal model.".format(reg.get_params()['max_depth']))

# Feature rows for three hypothetical clients.
# NOTE(review): values are assumed to follow the column order of `features` - confirm.
client_data = [[5, 17, 15],  # Client 1
               [4, 32, 22],  # Client 2
               [8, 3, 12]]   # Client 3

# Prediction. The loop variable is deliberately not named `price`, so the
# module-level target Series `price` is not overwritten.
for client_number, predicted_price in enumerate(reg.predict(client_data), start=1):
    print("Predicted selling price for Client {}'s home: ${:,.2f}".format(client_number, predicted_price))
# Sample output:
# 波士顿房价数据有489 行 points with 4 variables each.
# 房价最小值是: 105000.0
# 房价最大值是: 1024800.0
# 房价均值是: 454342.9447852761
# 房价中位数是: 438900.0
# 房价标准差是: 165340.27765266786
# Model has a coefficient of determination, R^2, of 0.923.
# Training and testing split was successful.
#
# (The "reader comments" section of the original web page followed here.)