Scrapy Crawlers and Machine Learning, Part 3: Predicting Home Listing Prices

Author: DT数据说 | Published 2019-01-15 16:54

    Building on the earlier scrape of all 2,453 second-hand listings in one district from a real-estate agency site, this article uses machine-learning regression models to predict the price at which a friend plans to list their apartment. Comparing several models on the MAE error metric, the best performer was the Gradient Boosting Regressor, with MAE = 1837 yuan/m². Feeding the friend's apartment details into this model gives a predicted listing price of 75,064.67 yuan/m², roughly 10% below last year's market listing price.
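    Before the full notebook, here is the comparison idea in miniature: fit several regressors on the same train/test split and rank them by error. This is a minimal sketch; the helper name compare_models and the model list are illustrative, not from the original code. Note that the notebook below actually reports scikit-learn's median_absolute_error, the median rather than the mean of the absolute errors.

    from sklearn.ensemble import GradientBoostingRegressor

    from sklearn.linear_model import LinearRegression, Ridge

    from sklearn.metrics import median_absolute_error

    from sklearn.model_selection import train_test_split

    def compare_models(X, y, models):

        """Fit each candidate model and report its error on a held-out split."""

        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

        for name, model in models.items():

            model.fit(X_train, y_train)

            mae = median_absolute_error(y_test, model.predict(X_test))

            print("{}: MAE = {:.2f}".format(name, mae))

    # compare_models(X, y, {"linear": LinearRegression(), "ridge": Ridge(),

    #                       "gboost": GradientBoostingRegressor(random_state=42)})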

    Code:

    # Import the necessary libraries

    import numpy as np  # linear algebra

    import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

    get_ipython().run_line_magic('matplotlib', 'inline')

    import matplotlib.pyplot as plt  # Matlab-style plotting

    import seaborn as sns

    color = sns.color_palette()

    sns.set_style('darkgrid')

    import warnings

    def ignore_warn(*args, **kwargs):

        pass

    warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)

    from scipy import stats

    from scipy.stats import norm, skew #for some statistics

    pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points

    from subprocess import check_output

    # Read in the scraped data file

    dftrain = pd.read_csv(r'C:\Users\Guoli\Desktop\scrapyfolder\new_env\ajk\ajk0115.csv',encoding="gbk")

    # Drop the total price column; we predict unit price

    dftrain=dftrain.drop('totalprice',axis=1)

    dftrain.head()

    print("keys of df dataset:\n{}".format(dftrain.keys()))

    from sklearn.model_selection import train_test_split

    print("type of data:{}".format(type(dftrain['unitprice'])))

    fig, ax = plt.subplots()

    ax.scatter(x = dftrain['floorsize'], y = dftrain['unitprice'])

    plt.ylabel('unitprice', fontsize=13)

    plt.xlabel('floorsize', fontsize=13)

    plt.show()

    # Remove outliers: floor size above 400 m²

    dftrain = dftrain.drop(dftrain[(dftrain['floorsize']>400)].index)

    #Check the graphic again

    fig, ax = plt.subplots()

    ax.scatter(dftrain['floorsize'], dftrain['unitprice'])

    plt.ylabel('unitprice', fontsize=13)

    plt.xlabel('floorsize', fontsize=13)

    plt.show()

    # Drop rows with unit price above 100,000 yuan/m²

    dftrain = dftrain.drop(dftrain[(dftrain['unitprice']>100000)].index)

    # Plot floor size against unit price again

    fig, ax = plt.subplots()

    ax.scatter(dftrain['floorsize'], dftrain['unitprice'])

    plt.ylabel('unitprice', fontsize=13)

    plt.xlabel('floorsize', fontsize=13)

    plt.show()

    # Inspect the distribution of unit price

    sns.distplot(dftrain['unitprice'], fit=norm)

    # Get the fitted parameters used by the function

    (mu, sigma) = norm.fit(dftrain['unitprice'])

    print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

    # Now plot the distribution

    plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],

                loc='best')

    plt.ylabel('Frequency')

    plt.title('unitprice distribution')

    # Also draw the QQ plot

    fig = plt.figure()

    res = stats.probplot(dftrain['unitprice'], plot=plt)

    plt.show()

    # Correlation map to see how features correlate with unitprice

    corrmat = dftrain.corr()

    plt.subplots(figsize=(12,9))

    sns.heatmap(corrmat, vmax=0.9, square=True)

    dftrain.corr()  # correlations of features

    # One-hot encode the categorical features

    dftraindummy = pd.get_dummies(dftrain)

    # Bin floorsize into quantile groups

    bins = [0,0.1,0.5,0.9,1]

    groupnames = ['floorsize1','floorsize2','floorsize3','floorsize4']

    floorsize_binned = pd.get_dummies(pd.qcut(dftraindummy['floorsize'],bins,labels=groupnames))

    dftrainbined = pd.concat([dftraindummy,floorsize_binned],axis=1)
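    As a quick illustration of what the quantile binning above does (toy numbers, not from the dataset): pd.qcut with q=[0, 0.1, 0.5, 0.9, 1] cuts the series at its 10th, 50th, and 90th percentiles, so the four labels tag small, medium, large, and very large floor sizes:

    # Toy example of quantile binning (made-up floor sizes):

    toy = pd.Series([30, 50, 70, 90, 110, 130, 150, 170, 190, 400])

    print(pd.qcut(toy, [0, 0.1, 0.5, 0.9, 1],

                  labels=['floorsize1', 'floorsize2', 'floorsize3', 'floorsize4']))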

    # Correlation map again, now including the binned features

    corrmat = dftrainbined.corr()

    plt.subplots(figsize=(12,9))

    sns.heatmap(corrmat, vmax=0.9, square=True)

    # Separate features (X) and target (y)

    X = dftrainbined.iloc[:,dftrainbined.columns!='unitprice']

    y = dftrainbined['unitprice'].values

    # Modeling and prediction

    import numpy as np

    import matplotlib.pyplot as plt

    from sklearn import ensemble

    from sklearn.utils import shuffle

    from sklearn.metrics import mean_squared_error

    # Linear regression model

    from sklearn.linear_model import LinearRegression

    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)  # test set score is 0.79 with random_state=42

    lr = LinearRegression().fit(X_train, y_train)

    # print("Training set score:{:.2f}".format(lr.score(X_train,y_train)))

    # print('Test train score:{:.2f}'.format(grid_search.score(X_train,y_train)))

    print("Train set score:{:.2f}".format(lr.score(X_train,y_train)))

    print("Test set score:{:.2f}".format(lr.score(X_test,y_test)))

    mse = mean_squared_error(y_test, lr.predict(X_test))

    print("MSE: %.4f" % mse)

    # Ridge model

    from sklearn.linear_model import Ridge

    ridge = Ridge().fit(X_train, y_train)#alpha=1.0 by default

    print("Ridge Training set score:{:.2f}".format(ridge.score(X_train, y_train)))

    print("Ridge Test set score:{:.2f}".format(ridge.score(X_test, y_test)))

    from sklearn.model_selection import cross_val_score

    # Ridge tuned with grid-search cross-validation

    from sklearn.model_selection import GridSearchCV

    X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)  # test set score is 0.79 with random_state=42

    ridge = Ridge()

    param_grid={'alpha':[0.001,0.01,0.1,1]}

    grid_search = GridSearchCV(ridge, param_grid, cv=5)

    grid_search.fit(X_train,y_train)

    print('Test set score:{:.2f}'.format(grid_search.score(X_test,y_test)))

    print("Best parameters:{}".format(grid_search.best_params_))

    print('Training set score:{:.2f}'.format(grid_search.score(X_train,y_train)))

    print('Best cross-validation score:{:.2f}'.format(grid_search.best_score_))

    # Lasso model

    # L1 regularization: some coefficients become exactly zero

    from sklearn.linear_model import Lasso

    lasso = Lasso(max_iter=1000000)

    from sklearn.model_selection import GridSearchCV

    param_grid={'alpha':[0.001,0.01,0.1,1]}

    grid_search = GridSearchCV(lasso, param_grid,cv=5)

    X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

    grid_search.fit(X_train,y_train)

    print('Test set score:{:.2f}'.format(grid_search.score(X_test,y_test)))

    print("Best parameters:{}".format(grid_search.best_params_))

    print('Training set score:{:.2f}'.format(grid_search.score(X_train,y_train)))

    print('Best cross-validation score:{:.2f}'.format(grid_search.best_score_))

    from sklearn.metrics import median_absolute_error

    lasso = grid_search.fit(X_train,y_train)

    actuals = y_test

    predictions = lasso.predict(X_test)

    maelasso = median_absolute_error(actuals, predictions)

    print("MAE: %.4f" % maelasso)  # median absolute error

    # Lasso zeroes out some features and scores worse than Ridge; let's try a combined penalty

    # ElasticNet model

    from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC

    ENet = ElasticNet(l1_ratio=.9,max_iter=1000000, random_state=42)

    param_grid={'alpha':[0.001,0.01,0.1,1]}

    grid_search = GridSearchCV(ENet, param_grid,cv=5)

    X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

    grid_search.fit(X_train,y_train)

    print('Training set score:{:.2f}'.format(grid_search.score(X_train,y_train)))

    print('Test set score:{:.2f}'.format(grid_search.score(X_test,y_test)))

    enet = grid_search.fit(X_train,y_train)

    actuals = y_test

    predictions = enet.predict(X_test)

    maeenet = median_absolute_error(actuals, predictions)

    print("MAE: %.4f" % maeenet)  # median absolute error

    # Gradient boosting model

    from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor

    from sklearn.model_selection import KFold

    from sklearn.model_selection import cross_val_score

    kfold = KFold(n_splits=5)

    GBoost = GradientBoostingRegressor(n_estimators=500,learning_rate=0.1,max_depth=10, max_features='sqrt',

                                      min_samples_leaf=5, min_samples_split=5,

                                      loss='huber', random_state =42)


    X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

    GBoost.fit(X_train,y_train)

    print("cross-validation train scores:\n{}".format(np.mean(cross_val_score(GBoost,X_train,y_train,cv=kfold))))

    print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(GBoost,X_test,y_test,cv=kfold))))

    mse = mean_squared_error(y_test, GBoost.predict(X_test))

    print("MSE: %.4f" % mse)

    from sklearn.metrics import mean_squared_log_error

    msle = mean_squared_log_error(y_test, GBoost.predict(X_test))

    print("MSLE: %.4f" % msle)

    from sklearn.metrics import median_absolute_error

    mae = median_absolute_error(y_test, GBoost.predict(X_test))

    print("MAE: %.4f" % mae)#平均绝对差

    # GradientBoostingRegressor parameter tuning

    # After tuning, GradientBoostingRegressor reaches MAE = 1795.62 yuan/m², the best of all the models

    params = {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5,

              'learning_rate': 0.1, 'loss': 'ls'}

    clfr = ensemble.GradientBoostingRegressor(**params)

    clfr.fit(X_train, y_train)

    print("cross-validation train scores:\n{}".format(np.mean(cross_val_score(clfr,X_train,y_train,cv=kfold))))

    print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clfr,X_test,y_test,cv=kfold))))

    mse = mean_squared_error(y_test, clfr.predict(X_test))

    print("MSE: %.4f" % mse)

    mae = median_absolute_error(y_test, clfr.predict(X_test))

    print("MAE: %.4f" % mae)#平均绝对差

    # compute test set deviance

    test_score = np.zeros((params['n_estimators'],), dtype=np.float64)

    for i, y_pred in enumerate(clfr.staged_predict(X_test)):

        test_score[i] = clfr.loss_(y_test, y_pred)

    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)

    plt.title('Deviance')

    plt.plot(np.arange(params['n_estimators']) + 1, clfr.train_score_, 'b-',

            label='Training Set Deviance')

    plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',

            label='Test Set Deviance')

    plt.legend(loc='upper right')

    plt.xlabel('Boosting Iterations')

    plt.ylabel('Deviance')

    # #############################################################################

    # Plot feature importance

    feature_importance = clfr.feature_importances_

    # make importances relative to max importance

    feature_importance = 100.0 * (feature_importance / feature_importance.max())

    sorted_idx = np.argsort(feature_importance)

    pos = np.arange(sorted_idx.shape[0]) + .5

    plt.subplot(1, 2, 2)

    plt.barh(pos, feature_importance[sorted_idx], align='center')

    plt.yticks(pos, X.columns[sorted_idx])  # label with the feature names from X (dftrainbined also contains unitprice)

    plt.xlabel('Relative Importance')

    plt.title('Variable Importance')

    plt.show()

    # <h1>预测房屋的拟挂牌价格76293元/平方米,误差在1795.62元/平方米</h1>

    # In[259]:

    # Feature vector for the friend's home, in the same column order as X

    X2 = np.array([3,2004,155,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0])

    X3 = X2.reshape(1, -1)

    pred_clfr = clfr.predict(X3)

    print(pred_clfr)
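    Hand-ordering all 27 columns in a raw array is easy to get wrong. A safer pattern (a sketch; 'rooms' and 'floorsize' are placeholder names for whatever actually appears in X.columns) is to build the row against X.columns, so the feature order always matches training:

    # Sketch: build the prediction row from the training columns themselves

    new_home = pd.Series(0, index=X.columns)  # every feature starts at zero

    # new_home['rooms'] = 3        # placeholder column names -- check X.columns

    # new_home['floorsize'] = 155

    # ...set the relevant one-hot columns to 1, then:

    # print(clfr.predict(new_home.values.reshape(1, -1)))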


    # LightGBM model

    import lightgbm as lgb

    gbm1 = lgb.LGBMRegressor(objective='regression',

                            num_leaves=31,

                            learning_rate=0.05,

                            n_estimators=1000)

    gbm1.fit(X_train, y_train,

            eval_set=[(X_test, y_test)],

            eval_metric='l1',

            early_stopping_rounds=5)

    print('Start predicting...')

    y_pred = gbm1.predict(X_test)

    # eval

    #print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

    maelgbm = median_absolute_error(y_test, y_pred)

    print("MAE: %.4f" % maelgbm)#平均绝对差

    print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(gbm1,X_test,y_test,cv=5))))

    # XGBoost model

    import pickle

    import xgboost as xgb

    import numpy as np

    from sklearn.model_selection import KFold, train_test_split, GridSearchCV

    from sklearn.metrics import mean_squared_error

    kf = KFold(n_splits=2, shuffle=True, random_state=42)

    for train_index, test_index in kf.split(X):

        # Train on the fold's training split and evaluate on its held-out split

        xgb_model = xgb.XGBRegressor().fit(X.iloc[train_index], y[train_index])

        predictions = xgb_model.predict(X.iloc[test_index])

        actuals = y[test_index]

        print(mean_squared_error(actuals, predictions))

    print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(xgb_model,X_test,y_test,cv=kfold))))

    maelxgb_model = median_absolute_error(actuals, predictions)

    print("MAE: %.4f" % maelxgb_model)#平均绝对差

    xgb_model2 = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=-1)

    # early stopping needs an eval_set to monitor, so pass both in fit()

    clf = xgb_model2.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5)

    predictions = clf.predict(X_test)

    actuals = y_test

    print(mean_squared_error(actuals, predictions))

    print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clf,X_test,y_test,cv=kfold))))

    maeclf = median_absolute_error(actuals, predictions)

    print("MAE: %.4f" % maeclf)#平均绝对差

    xgb_model3 = xgb.XGBRegressor(n_estimators=2000, learning_rate=0.01, max_depth=2, n_jobs=-1)  # a smaller learning rate scores higher here

    clf2 = xgb_model3.fit(X_train,y_train)

    predictions = clf2.predict(X_test)

    actuals = y_test

    print(mean_squared_error(actuals, predictions))

    print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clf2,X_test,y_test,cv=kfold))))

    maeclf2 = median_absolute_error(actuals, predictions)

    print("MAE: %.4f" % maeclf2)#平均绝对差

    xgb_model4 = xgb.XGBRegressor(n_estimators= 1000,learning_rate=0.1,max_depth=3,gamma=0.01,n_jobs=-1)

    clf3 = xgb_model4.fit(X_train,y_train)

    predictions = clf3.predict(X_test)

    actuals = y_test

    print(mean_squared_error(actuals, predictions))

    print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clf3,X_test,y_test,cv=5))))

    maeclf3 = median_absolute_error(actuals, predictions)

    print("MAE: %.4f" % maeclf3)#平均绝对差

    # Blend the two strongest models' predictions for the friend's home (30% gradient boosting, 70% LightGBM)

    pred_gbm1 = gbm1.predict(X3)

    ensemble3 = pred_clfr*0.3 + pred_gbm1*0.7

    print(ensemble3)

    # Save the best-performing gradient boosting model

    from sklearn.externals import joblib  # in newer scikit-learn, use: import joblib

    model1=clfr

    filename = 'GBMclfr_finalized_model0115.sav'

    joblib.dump(model1, filename)

    # some time later...

    # load the model from disk

    # loaded_model = joblib.load(filename)

    # result = loaded_model.score(X_test, y_test)

    # print(result)

