
阿里云蒸汽量预测新人赛赛题解析 (Analysis of the Alibaba Cloud Tianchi Steam Prediction Beginner Competition)

Author: qiufeng1ye | Published 2020-11-20 15:49

    The reference textbook is 《阿里云天池大赛赛题解析——机器学习篇》.


    2.2 Data Exploration

    2.2.2 Reading the Data

    import pandas as pd

    train_data_file = "./zhengqi_train.txt"
    test_data_file = "./zhengqi_test.txt"
    
    train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
    test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
    
    train_data.describe()
    

    2.2.4 Visualizing the Data Distributions

    import matplotlib.pyplot as plt
    import seaborn as sns

    fig = plt.figure(figsize=(4,6))
    sns.boxplot(train_data['V0'], orient='v', width=0.5)
    
    column = train_data.columns.tolist()[:39]
    fig = plt.figure(figsize=(80,60), dpi=75)
    for i in range(38):
        plt.subplot(7,8,i + 1)
        sns.boxplot(train_data[column[i]], orient='v', width = 0.5)
        plt.ylabel(column[i],fontsize= 36)
    plt.show()
    
    Box plots

    Detecting outliers and plotting them (Ridge regression)

    # function to detect outliers based on the predictions of a model 
    def find_outliers(model, X, y, sigma=3): # X:feature y:label
        # predict y values using model
        try:
            y_pred = pd.Series(model.predict(X), index=y.index)
        # if predicting failed, try fitting the model first
        except:
            model.fit(X, y)
            y_pred = pd.Series(model.predict(X), index=y.index)
            
        # calculate residuals between the model prediction and true y values
        resid = y - y_pred
        mean_resid = resid.mean()
        std_resid = resid.std()
        
        # calculate a statistic, define outliers to be where z >sigma
        z = (resid - mean_resid) / std_resid
        outliers = z[abs(z) > sigma].index
        
        # print and plot the results 
        print('R2=', model.score(X,y))
        print('mse=', mean_squared_error(y,y_pred))
        print('-----------------------------------')
        
        print('mean of residuals:', mean_resid)
        print('std of residuals:' , std_resid)
        print('-----------------------------------')
        
        print(len(outliers),'outliers:')
        print(outliers.tolist())
        
        plt.figure(figsize=(15,5))
        ax_131 = plt.subplot(1,3,1)
        plt.plot(y ,y_pred, '.')
        plt.plot(y.loc[outliers], y_pred.loc[outliers], 'ro')
        plt.legend(['Accepted','Outliers'])
        plt.xlabel('y')
        plt.ylabel('y_pred')    
        
        ax_132 = plt.subplot(1,3,2)
        plt.plot(y ,y - y_pred, '.')
        plt.plot(y.loc[outliers], y.loc[outliers] - y_pred.loc[outliers], 'ro')
        plt.legend(['Accepted','Outliers'])
        plt.xlabel('y')
        plt.ylabel('y - y_pred')
        
        ax_133 = plt.subplot(1,3,3)
        z.plot.hist(bins=50,ax= ax_133)
        z.loc[outliers].plot.hist(color='r', bins=50, ax=ax_133)
        plt.legend(['Accepted','Outliers'])
        plt.xlabel('z')
        
        plt.savefig('outliers.png')
        
        return outliers
    
    from sklearn.linear_model import Ridge
    from sklearn.metrics import mean_squared_error
    X_train = train_data.iloc[:,0:-1]
    y_train = train_data.iloc[:,-1]
    outliers = find_outliers(Ridge(), X_train, y_train)
    

    output

    R2= 0.8890858938210386
    mse= 0.10734857773123632
    -----------------------------------
    mean of residuals: 3.3291313688552134e-17
    std of residuals: 0.32769766731934985
    -----------------------------------
    31 outliers:
    [321, 348, 376, 777, 884, 1145, 1164, 1310, 1458, 1466, 1484, 1523, 1704, 1874, 1879, 1979, 2002, 2279, 2528, 2620, 2645, 2647, 2667, 2668, 2669, 2696, 2767, 2769, 2807, 2842, 2863]
    
    Outlier detection and plot (Ridge regression)

    Histograms and Q-Q Plots

    First, plot histograms of the feature variables to inspect their distributions on the training set, and draw Q-Q plots to check whether each distribution is approximately normal.

    from scipy import stats

    train_cols = 8
    train_rows = len(train_data.columns)
    plt.figure(figsize=(4*train_cols, 4*train_rows))
    
    i = 0
    for col in train_data.columns:
        i+=1
        ax = plt.subplot(train_rows, train_cols, i)
        sns.distplot(train_data[col],fit=stats.norm)
        
        i+=1
        ax = plt.subplot(train_rows, train_cols, i)
        res = stats.probplot(train_data[col], plot=plt)
        
    plt.tight_layout()
    plt.show()
    
    Histograms and Q-Q plots

    KDE (Kernel Density Estimation) Plots

    Compare the distribution of each feature variable between the training set and the test set.

    dist_cols = 8
    dist_rows = len(test_data.columns)
    plt.figure(figsize=(4*dist_cols,4*dist_rows))
    
    i=1
    for col in test_data.columns:
        ax = plt.subplot(dist_rows, dist_cols, i)
        ax = sns.kdeplot(train_data[col], color='red', shade=True)
        ax = sns.kdeplot(test_data[col], color='Blue',shade=True)
        ax.set_xlabel(col)
        ax.set_ylabel("Frequency")
        ax =ax.legend(["train","test"])
        i+=1
        
    plt.show()
    
    KDE plots

    Based on the plots above, drop the feature variables whose train/test distributions are inconsistent: V5, V9, V11, V17, V22, V28.
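
    The book identifies these features by visually comparing the KDE curves; as an optional, rough cross-check one could quantify the train/test mismatch with a two-sample Kolmogorov-Smirnov test (a sketch only, not part of the original solution; the 0.1 threshold is an arbitrary illustration):

    from scipy.stats import ks_2samp

    suspect_cols = []
    for col in test_data.columns:
        stat, p_value = ks_2samp(train_data[col], test_data[col])
        # a large KS statistic (tiny p-value) means the two samples come from very different distributions
        if stat > 0.1:
            suspect_cols.append(col)
    print(suspect_cols)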

    Linear Regression Plots

    Regression plots are used to examine the linear relationship between each feature variable and the target.

    fcols = 8
    frows = len(test_data.columns)
    plt.figure(figsize=(5*fcols,4*frows))
    
    i=0
    for col in test_data.columns:
        i+=1
        ax = plt.subplot(frows, fcols, i)
        sns.regplot(x=col, y= 'target', data=train_data, ax=ax, scatter_kws={'marker':'.','s':3,'alpha':0.3}, line_kws={'color':'k'})
        ax.set_xlabel(col)
        ax.set_ylabel("Target")
        
        i+=1
        ax = plt.subplot(frows, fcols, i)
        sns.distplot(train_data[col].dropna())
        plt.xlabel(col)
        
    plt.show()
    
    Regression plots

    2.2.5 Feature Correlations

    1. Computing the correlation coefficients

    After dropping the feature variables whose train/test distributions differ (V5, V9, V11, V17, V22, V28), compute the correlation coefficients between the remaining features and the target variable.

    pd.set_option('display.max_columns', 10)
    pd.set_option('display.max_rows', 10)
    data_train1 = train_data.drop(['V5','V9','V11','V17','V22','V28'],axis=1)
    
    train_corr = data_train1.corr()
    train_corr
    fig, ax = plt.subplots(figsize=(20, 16))
    ax = sns.heatmap(train_corr, vmax=.8, square=True, annot=True)
    
    Correlation heatmap
    3. Selecting features by correlation coefficient

    Find the feature variables whose absolute correlation coefficient with the target is greater than 0.5.

    threshold = 0.5
    
    corrmat = train_data.corr()
    top_corr_features = corrmat.index[abs(corrmat["target"]) > threshold]
    plt.figure(figsize=(10,10))
    g = sns.heatmap(train_data[top_corr_features].corr(), annot=True, cmap="RdYlGn")
    
    Features selected by correlation coefficient

    4. Box-Cox transformation

    Since linear regression assumes (approximately) normally distributed data, the data should be transformed toward a normal distribution before this statistical analysis.
    Before applying Box-Cox, the data need to be min-max normalized; concatenating the training and test data first keeps the two sets on a consistent scale.
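
    As a quick illustration of what the transform does (toy data only; not part of the book's pipeline), scipy's stats.boxcox returns the transformed values together with the fitted lambda:

    from scipy import stats
    import numpy as np

    toy = np.random.exponential(scale=2.0, size=1000)   # a strongly right-skewed toy sample
    toy_bc, lmbda = stats.boxcox(toy + 1)                # shift by 1 so every value is strictly positive
    print('fitted lambda:', lmbda)
    print('skew before: {:.3f}, after: {:.3f}'.format(stats.skew(toy), stats.skew(toy_bc)))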

    drop_columns = ['V5','V9','V11','V17','V22','V28']
    train_x = train_data.drop(['target'],axis=1)
    # concatenate the training and test features
    data_all = pd.concat([train_x, test_data])
    data_all.drop(drop_columns, axis=1, inplace=True)
    data_all.head()
    

    Apply min-max normalization to every column of the merged data.

    cols_numeric = list(data_all.columns)
    
    def scale_minmax(col):
        return (col - col.min()) / (col.max() - col.min())
    
    data_all[cols_numeric] = data_all[cols_numeric].apply(scale_minmax,axis=0)
    data_all[cols_numeric].describe()
    

    After applying the Box-Cox transform to each feature variable, compute the quantiles and plot them against the normal distribution, and show the linear relationship between each feature and the target variable.

    train_data_process = train_data[cols_numeric]
    train_data_process = train_data_process[cols_numeric].apply(scale_minmax,axis=0)
    
    cols_numeric_left = cols_numeric[0:13]
    cols_numeric_right = cols_numeric[13:]
    train_data_process = pd.concat([train_data_process, train_data['target']],axis=1)
    
    fcols = 6
    frows = len(cols_numeric_left)
    plt.figure(figsize=(4*fcols,4*frows))
    i = 0
    
    for var in cols_numeric_left:
        dat = train_data_process[[var, 'target']].dropna()
        i+=1
        plt.subplot(frows,fcols,i)
        sns.distplot(dat[var],fit=stats.norm)
        plt.title(var+' Original')
        plt.xlabel('')
        i+=1
        plt.subplot(frows,fcols,i)
        _ = stats.probplot(dat[var],plot=plt)
        plt.title('skew='+'{:.4f}'.format(stats.skew(dat[var])))
        plt.xlabel('')
        plt.ylabel('')
        i+=1
        plt.subplot(frows,fcols,i)
        plt.plot(dat[var],dat['target'],'.',alpha=0.5)
        plt.title('corr='+'{:.2f}'.format(np.corrcoef(dat[var],dat['target'])[0][1]))
        i+=1
        plt.subplot(frows,fcols,i)
        trans_var, lambda_var = stats.boxcox(dat[var].dropna() + 1)
        trans_var = scale_minmax(trans_var)
        sns.distplot(trans_var,fit=stats.norm)
        plt.title(var+' Transformed')
        plt.xlabel('')
        i+=1
        plt.subplot(frows,fcols,i)
        _ = stats.probplot(trans_var,plot=plt)
        plt.title('skew='+'{:.4f}'.format(stats.skew(trans_var)))
        plt.xlabel('')
        plt.ylabel('')
        i+=1
        plt.subplot(frows,fcols,i)
        plt.plot(trans_var,dat['target'],'.',alpha=0.5)
        plt.title('corr='+'{:.2f}'.format(np.corrcoef(trans_var,dat['target'])[0][1]))
    
    Before and after the Box-Cox transform

    After the transform, the variable distributions are clearly closer to normal.

    3.4 Feature Engineering

    3.4.1 Outlier Analysis

    Draw a box plot for each feature:

    plt.figure(figsize=(18,10))
    plt.boxplot(x = train_data.values, labels = train_data.columns)
    plt.hlines([-7.5,7.5],0,40,colors='r')
    plt.show()
    
    Outlier analysis

    Remove the outliers from the training set and the test set.

    train_data = train_data[train_data['V9']>-7.5]
    test_data = test_data[test_data['V9']>-7.5]
    display(train_data.describe())
    display(test_data.describe())
    

    3.4.2 Min-Max Normalization

    Normalize the data to the [0, 1] range.

    from sklearn import preprocessing
    
    features_columns = [col for col in train_data.columns if col not in ['target']]
    min_max_scaler = preprocessing.MinMaxScaler()
    min_max_scaler = min_max_scaler.fit(train_data[features_columns]) 
    
    train_data_scaler = min_max_scaler.transform(train_data[features_columns])
    test_data_scaler = min_max_scaler.transform(test_data[features_columns])
    
    train_data_scaler = pd.DataFrame(train_data_scaler)
    train_data_scaler.columns = features_columns
    
    test_data_scaler = pd.DataFrame(test_data_scaler)
    test_data_scaler.columns = features_columns
    train_data_scaler['target'] = train_data['target']
    
    display(train_data_scaler.describe())
    display(test_data_scaler.describe())
    

    3.4.3 Checking the Data Distributions

    Drop the feature variables whose distributions are inconsistent between train and test.

    train_data_scaler = train_data_scaler.drop(['V5','V9','V11','V17','V22','V28'],axis=1)
    test_data_scaler = test_data_scaler.drop(['V5','V9','V11','V17','V22','V28'],axis=1)
    

    3.4.4 Feature Correlation

    Compute the feature correlations and visualize them as a heatmap.

    import seaborn as sns
    
    plt.figure(figsize=(20,16))
    column = train_data_scaler.columns.tolist()
    mcorr = train_data_scaler[column].corr(method="spearman")
    mask = np.zeros_like(mcorr, dtype=bool)  # np.bool is deprecated in recent NumPy
    mask[np.triu_indices_from(mask)] = True
    cmap = sns.diverging_palette(220,10,as_cmap = True)
    g = sns.heatmap(mcorr, mask = mask, cmap = cmap, square = True, annot = True, fmt='0.2f')
    plt.show()
    
    Feature correlation heatmap

    3.4.5 Feature Dimensionality Reduction

    Compute the absolute correlation coefficients and keep the features whose correlation with the target is greater than 0.1.

    mcorr = mcorr.abs()
    numerical_corr = mcorr[mcorr['target']>0.1]['target']
    print(numerical_corr.sort_values(ascending=False))
    

    output

    target    1.000000
    V0        0.866709
    V1        0.832457
    V8        0.799280
    V27       0.765133
    V31       0.749034
    V2        0.630160
    V4        0.574775
    V12       0.542429
    V16       0.510025
    V3        0.501114
    V37       0.497162
    V20       0.420424
    V10       0.371067
    V24       0.296056
    V36       0.287696
    V6        0.264778
    V15       0.213490
    V29       0.198244
    V13       0.177317
    V19       0.171120
    V7        0.164981
    V18       0.162710
    V30       0.123423
    Name: target, dtype: float64
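
    To actually keep only these columns, one can take the index of the filtered series (a small sketch; train_data_reduced is an illustrative name, not part of the book's code):

    features_kept = numerical_corr.index.tolist()        # note: includes 'target' itself
    train_data_reduced = train_data_scaler[features_kept]
    print(train_data_reduced.shape)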
    

    3.4.7 PCA

    Use PCA to remove multicollinearity in the data and reduce its dimensionality. Here PCA keeps 90% of the variance, and the result is compared with the data before the transformation:

    from sklearn.decomposition import PCA
    
    pca = PCA(n_components = 0.9)
    new_train_pca_90 = pca.fit_transform(train_data_scaler.iloc[:,0:-1])
    new_test_pca_90 = pca.transform(test_data_scaler)
    new_train_pca_90 = pd.DataFrame(new_train_pca_90)
    new_test_pca_90 = pd.DataFrame(new_test_pca_90)
    new_train_pca_90['target'] = train_data_scaler['target']
    display(new_train_pca_90.describe())
    display(train_data_scaler.describe())
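
    To check how much variance was actually retained and how many components that required, one can inspect the fitted PCA object (an optional check, not in the book's code):

    print('components kept:', pca.n_components_)
    print('explained variance ratio (sum):', pca.explained_variance_ratio_.sum())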
    

    Keep 16 principal components with PCA:

    pca = PCA(n_components = 16)
    new_train_pca_16 = pca.fit_transform(train_data_scaler.iloc[:,0:-1])
    new_test_pca_16 = pca.transform(test_data_scaler)
    new_train_pca_16 = pd.DataFrame(new_train_pca_16)
    new_test_pca_16 = pd.DataFrame(new_test_pca_16)
    new_train_pca_16['target'] = train_data_scaler['target']
    display(new_train_pca_16.describe())
    display(train_data_scaler.describe())
    

    4.2 Model Training

    4.2.1 Importing the Libraries

    First, import the Python packages used for model fitting and performance evaluation.

    from sklearn.linear_model import LinearRegression    # linear regression
    from sklearn.neighbors import KNeighborsRegressor    # k-nearest-neighbors regression
    from sklearn.tree import DecisionTreeRegressor       # decision tree regression
    from sklearn.ensemble import RandomForestRegressor   # random forest regression
    from sklearn.svm import SVR                           # support vector regression
    import lightgbm as lgb                                # LightGBM
    
    from sklearn.model_selection import train_test_split  # data splitting
    from sklearn.metrics import mean_squared_error        # evaluation metric
    
    from sklearn.model_selection import learning_curve
    from sklearn.model_selection import ShuffleSplit
    

    4.2.2 Splitting the Data

    Split the training data into an 80% training set and a 20% validation set.

    new_train_pca_16 = new_train_pca_16.fillna(0)
    train = new_train_pca_16.drop(['target'], axis=1)   # features only; keeping 'target' here would leak the label
    target = new_train_pca_16['target']
    
    train_data, test_data, train_target, test_target = train_test_split(train, target, test_size=0.2, random_state=0)
    

    4.2.3 Multiple Linear Regression

    Pros: a simple model, easy to deploy; the regression weights can be used for analysis; fast to train.
    Cons: low accuracy; sensitive to collinearity among the features.
    Tips: normalize the features; do some feature selection and try not to keep highly correlated features together.
    Result on this problem: mediocre accuracy, but useful for analysis.

    clf = LinearRegression()
    clf.fit(train_data, train_target)
    score = mean_squared_error(test_target, clf.predict(test_data))
    print("LinearRegression:", score)
    LinearRegression: 0.26582236740768944
    

    4.2.4 K-Nearest-Neighbors Regression

    Pros: simple and easy to understand; quick and convenient on small datasets; easy to visualize.
    Cons: computationally expensive, unsuitable for large datasets; requires parameter tuning.
    Tips: normalize the features; important features can be given extra weight.
    Result on this problem: mediocre.

    clf = KNeighborsRegressor(n_neighbors=8)
    clf.fit(train_data, train_target)
    score = mean_squared_error(test_target, clf.predict(test_data))
    print("KNeighborsRegressor:", score)
    KNeighborsRegressor: 0.27912407507028547
    

    4.2.5 Random Forest Regression

    Pros: easy to use, features need little transformation; good accuracy; the trees can be trained in parallel, so training is fast.
    Cons: the results are hard to interpret.
    Tips: tune the parameters to improve accuracy.
    Result on this problem: a good fit.

    clf = RandomForestRegressor(n_estimators=200)
    clf.fit(train_data, train_target)
    score = mean_squared_error(test_target, clf.predict(test_data))
    print("RandomForestRegressor:", score)
    RandomForestRegressor: 0.24562419837569202
    

    4.2.6 LightGBM Regression

    Pros: high accuracy.
    Cons: long training time; a more complex model.
    Tips: use a proper validation set to prevent overfitting (see the sketch after the code below); search the hyperparameters.
    Result on this problem: a good fit.

    clf = lgb.LGBMRegressor(
        learning_rate = 0.01,
        max_depth = -1,
        n_estimators = 5000,
        boosting_type = 'gbdt',
        random_state = 2019,
        objective='regression',
    )
    clf.fit(X=train_data, y=train_target, eval_metric='MSE', verbose=50)
    score = mean_squared_error(test_target, clf.predict(test_data))
    print("lightGBM:", score)
    lightGBM: 0.2309519095788757
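
    A minimal sketch of the validation-set tip above: pass the held-out split as eval_set and stop when the validation MSE stops improving (the early_stopping_rounds value of 100 here is an arbitrary illustration, not the book's setting):

    clf = lgb.LGBMRegressor(
        learning_rate=0.01,
        n_estimators=5000,
        objective='regression',
        random_state=2019,
    )
    clf.fit(
        X=train_data, y=train_target,
        eval_set=[(test_data, test_target)],
        eval_metric='l2',
        early_stopping_rounds=100,   # stop if the validation score has not improved for 100 rounds
        verbose=100,
    )
    print("best iteration:", clf.best_iteration_)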
    

    5.3 Model Validation

    5.3.1 Underfitting and Overfitting

    An underfitting example; the code and output are as follows:

    from scipy import stats
    import warnings
    warnings.filterwarnings("ignore")
    from sklearn.linear_model import SGDRegressor
    
    clf = SGDRegressor(max_iter=500, tol=1e-2)
    clf.fit(train_data, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data))
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print("SGDRegressor train MSE:", score_train)
    print("SGDRegressor test MSE:", score_test)
    SGDRegressor train MSE: 0.37776579423967643
    SGDRegressor test MSE: 0.30333099831254734
    

    An overfitting example; the code and output are as follows:

    from sklearn.preprocessing import PolynomialFeatures
    poly = PolynomialFeatures(5)
    train_data_poly  = poly.fit_transform(train_data)
    test_data_poly  = poly.fit_transform(test_data)
    clf = SGDRegressor(max_iter = 1000, tol = 1e-3)
    clf.fit(train_data_poly, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
    score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
    print("SGDRegressor train MSE:", score_train)
    print("SGDRegressor test MSE:", score_test)
    SGDRegressor train MSE: 0.3088568266233788
    SGDRegressor test MSE: 0.26365730216068484
    

    A normally fitted example; the code and output are as follows:

    from sklearn.preprocessing import PolynomialFeatures
    poly = PolynomialFeatures(3)
    train_data_poly  = poly.fit_transform(train_data)
    test_data_poly  = poly.fit_transform(test_data)
    clf = SGDRegressor(max_iter = 1000, tol = 1e-3)
    clf.fit(train_data_poly, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
    score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
    print("SGDRegressor train MSE:", score_train)
    print("SGDRegressor test MSE:", score_test)
    SGDRegressor train MSE: 0.31246590877931896
    SGDRegressor test MSE: 0.26532038403344865
    

    5.3.2 Model Regularization

    Regularize the model with the L2 norm; the code is as follows:

    poly = PolynomialFeatures(3)
    train_data_poly  = poly.fit_transform(train_data)
    test_data_poly  = poly.fit_transform(test_data)
    clf = SGDRegressor(max_iter = 1000, tol = 1e-3, penalty='L2', alpha=0.0001)
    clf.fit(train_data_poly, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
    score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
    print("SGDRegressor train MSE:", score_train)
    print("SGDRegressor test MSE:", score_test)
    SGDRegressor train MSE: 0.31444724414117475
    SGDRegressor test MSE: 0.26588746232300303
    

    Regularize the model with the L1 norm; the code is as follows:

    poly = PolynomialFeatures(3)
    train_data_poly  = poly.fit_transform(train_data)
    test_data_poly  = poly.fit_transform(test_data)
    clf = SGDRegressor(max_iter = 1000, tol = 1e-3, penalty='L1', alpha=0.00001)
    clf.fit(train_data_poly, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
    score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
    print("SGDRegressor train MSE:", score_train)
    print("SGDRegressor test MSE:", score_test)
    SGDRegressor train MSE: 0.3143081286245166
    SGDRegressor test MSE: 0.26633882737907394
    

    Regularize the model with ElasticNet; the code is as follows:

    poly = PolynomialFeatures(3)
    train_data_poly  = poly.fit_transform(train_data)
    test_data_poly  = poly.fit_transform(test_data)
    clf = SGDRegressor(max_iter = 1000, tol = 1e-3, penalty='elasticnet',  l1_ratio = 0.9, alpha=0.00001)
    clf.fit(train_data_poly, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
    score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
    print("SGDRegressor train MSE:", score_train)
    print("SGDRegressor test MSE:", score_test)
    SGDRegressor train MSE: 0.3145747479972795
    SGDRegressor test MSE: 0.2665089998958983
    

    5.3.3 Cross-Validation

    Simple hold-out validation: split the data into training and validation sets at an 8:2 ratio.

    # simple hold-out validation
    from sklearn.model_selection import train_test_split  # data splitting
    # split the data: 80% training, 20% validation
    train_data, test_data, train_target, test_target = train_test_split(train, target, test_size = 0.2, random_state=0)
    
    clf = SGDRegressor(max_iter = 1000, tol = 1e-3)
    clf.fit(train_data, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data))
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print("SGDRegressor train MSE:", score_train)
    print("SGDRegressor test MSE:", score_test)
    SGDRegressor train MSE: 0.35653456761449837
    SGDRegressor test MSE: 0.27929212422296323
    

    K-fold cross-validation with K = 5; the code is as follows:

    from sklearn.model_selection import KFold
    
    kf = KFold(n_splits = 5)
    for k, (train_index, test_index) in enumerate(kf.split(train)):
        train_data, test_data, train_target, test_target = train.values[train_index], train.values[test_index], target[train_index], target[test_index]
        clf = SGDRegressor(max_iter = 1000, tol = 1e-3)
        clf.fit(train_data, train_target)
        score_train = mean_squared_error(train_target, clf.predict(train_data))
        score_test = mean_squared_error(test_target, clf.predict(test_data))
        print(k,"折","SGDRegressor train MSE:", score_train)
        print(k,"折","SGDRegressor test MSE:", score_test, '\n')
    0 折 SGDRegressor train MSE: 0.3705300016301921
    0 折 SGDRegressor test MSE: 0.21367230031317322 
    
    1 折 SGDRegressor train MSE: 0.3465093570084947
    1 折 SGDRegressor test MSE: 0.3071764501574828 
    
    2 折 SGDRegressor train MSE: 0.34798441935505325
    2 折 SGDRegressor test MSE: 0.33916831324981905 
    
    3 折 SGDRegressor train MSE: 0.3062791836060481
    3 折 SGDRegressor test MSE: 0.5373752495243562 
    
    4 折 SGDRegressor train MSE: 0.30784894549380654
    4 折 SGDRegressor test MSE: 0.49677933559823606 
    

    Leave-one-out cross-validation; the code is as follows:

    from sklearn.model_selection import LeaveOneOut
    loo = LeaveOneOut()
    num = 100
    for k, (train_index, test_index) in enumerate(loo.split(train)):
        train_data, test_data, train_target, test_target = train.values[train_index], train.values[test_index], target[train_index], target[test_index]
        clf = SGDRegressor(max_iter = 1000, tol = 1e-3)
        clf.fit(train_data, train_target)
        score_train = mean_squared_error(train_target, clf.predict(train_data))
        score_test = mean_squared_error(test_target, clf.predict(test_data))
        print(k,"个","SGDRegressor train MSE:", score_train)
        print(k,"个","SGDRegressor test MSE:", score_test, '\n')
        if k >= 9:
            break
    

    Leave-P-out cross-validation (P = 10); the code is as follows:

    from sklearn.model_selection import LeavePOut
    lpo = LeavePOut(p=10)
    num = 100
    for k, (train_index, test_index) in enumerate(lpo.split(train)):
        train_data, test_data, train_target, test_target = train.values[train_index], train.values[test_index], target[train_index], target[test_index]
        clf = SGDRegressor(max_iter = 1000, tol = 1e-3)
        clf.fit(train_data, train_target)
        score_train = mean_squared_error(train_target, clf.predict(train_data))
        score_test = mean_squared_error(test_target, clf.predict(test_data))
        print(k,"10个","SGDRegressor train MSE:", score_train)
        print(k,"10个","SGDRegressor test MSE:", score_test, '\n')
        if k >= 9:
            break
    

    5.3.4 Hyperparameter Space and Tuning

    Train a random forest model and tune it with grid search; the code is as follows:

    from sklearn.model_selection import GridSearchCV
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import train_test_split  # data splitting
    # split the data: 80% training, 20% validation
    train_data,test_data,train_target,test_target=train_test_split(train,target,test_size=0.2,random_state=0)
    randomForestRegressor = RandomForestRegressor()
    parameters = {'n_estimators':[50, 100, 200],'max_depth':[1, 2, 3]}
    clf = GridSearchCV(randomForestRegressor, parameters, cv=5)
    clf.fit(train_data, train_target)
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print("RandomForestRegressor GridSearchCV test MSE:   ", score_test)
    sorted(clf.cv_results_.keys())
    

    output

    RandomForestRegressor GridSearchCV test MSE:    0.34479414952956544
    ['mean_fit_time',
     'mean_score_time',
     'mean_test_score',
     'mean_train_score',
     'param_max_depth',
     'param_n_estimators',
     'params',
     'rank_test_score',
     'split0_test_score',
     'split0_train_score',
     'split1_test_score',
     'split1_train_score',
     'split2_test_score',
     'split2_train_score',
     'split3_test_score',
     'split3_train_score',
     'split4_test_score',
     'split4_train_score',
     'std_fit_time',
     'std_score_time',
     'std_test_score',
     'std_train_score']
    

    Train a random forest model and tune it with randomized search; the code is as follows:

    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import train_test_split  # data splitting
    # split the data: 80% training, 20% validation
    train_data,test_data,train_target,test_target=train_test_split(train,target,test_size=0.2,random_state=0)
    randomForestRegressor = RandomForestRegressor()
    parameters = {'n_estimators':[50, 100, 200, 300],'max_depth':[1, 2, 3, 4, 5]}
    clf = RandomizedSearchCV(randomForestRegressor, parameters, cv=5)
    clf.fit(train_data, train_target)
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print("RandomForestRegressor RandomizedSearchCV test MSE:   ", score_test)
    sorted(clf.cv_results_.keys())
    

    output

    RandomForestRegressor RandomizedSearchCV test MSE:    0.2824045787313695
    ['mean_fit_time',
     'mean_score_time',
     'mean_test_score',
     'mean_train_score',
     'param_max_depth',
     'param_n_estimators',
     'params',
     'rank_test_score',
     'split0_test_score',
     'split0_train_score',
     'split1_test_score',
     'split1_train_score',
     'split2_test_score',
     'split2_train_score',
     'split3_test_score',
     'split3_train_score',
     'split4_test_score',
     'split4_train_score',
     'std_fit_time',
     'std_score_time',
     'std_test_score',
     'std_train_score']
    

    Train a LightGBM model and tune it with grid search; the code is as follows:

    clf = lgb.LGBMRegressor(num_leaves=31)
    parameters = {'learning_rate': [0.01, 0.1, 1],'n_estimators': [20, 40]}
    clf = GridSearchCV(clf, parameters, cv=5)
    clf.fit(train_data, train_target)
    print('Best parameters found by grid search are:', clf.best_params_)
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print("LGBMRegressor RandomizedSearchCV test MSE:   ", score_test)
    

    5.3.5 Learning Curve and Validation Curve

    Plot the learning curve of an SGDRegressor model trained on the data.

    print(__doc__)
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import model_selection 
    from sklearn.linear_model import SGDRegressor
    from sklearn.model_selection import learning_curve
    
    def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel("Training examples")
        plt.ylabel("Score")
        train_sizes, train_scores, test_scores = learning_curve(
            estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        plt.grid()
    
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1, color="g")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    
        plt.legend(loc="best")
        return plt
    
    train_data2 = pd.read_csv('./zhengqi_train.txt',sep='\t')
    test_data2 = pd.read_csv('./zhengqi_test.txt',sep='\t')
    X = train_data2[test_data2.columns].values
    y = train_data2['target'].values
    title = "LinearRegression"
    # Cross validation with 100 iterations to get smoother mean test and train
    # score curves, each time with 20% data randomly selected as a validation set.
    cv = model_selection.ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
    estimator = SGDRegressor()
    plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=-1)
    
    Learning curve

    Plot the validation curve of the SGDRegressor model.

    print(__doc__)
    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.linear_model import SGDRegressor
    from sklearn.model_selection import validation_curve
    
    X = train_data2[test_data2.columns].values
    y = train_data2['target'].values
    param_range = [0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001]
    train_scores, test_scores = validation_curve(
        SGDRegressor(max_iter=1000, tol=1e-3, penalty= 'L1'), X, y, param_name="alpha", param_range=param_range,
        cv=10, scoring='r2', n_jobs=1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    
    plt.title("Validation Curve with SGDRegressor")
    plt.xlabel("alpha")
    plt.ylabel("Score")
    plt.ylim(0.0, 1.1)
    plt.semilogx(param_range, train_scores_mean, label="Training score", color="r")
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
                 color="g")
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.legend(loc="best")
    plt.show()
    
    Validation curve

    6.2 Feature Optimization

    import pandas as pd
    
    train_data_file = "./zhengqi_train.txt"
    test_data_file =  "./zhengqi_test.txt"
    train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
    test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
    
    epsilon=1e-5
    # pairwise cross features; you can define your own, e.g. add x*x/y, log(x)/y, etc.
    func_dict = {
                'add': lambda x,y: x+y,
                'mins': lambda x,y: x-y,
                'div': lambda x,y: x/(y+epsilon),
                'multi': lambda x,y: x*y
                }
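
    # Illustrative extra cross features following the comment above (a sketch, not from the
    # book; the key names are made up). They could be added to func_dict like this:
    # func_dict['x_sq_div_y'] = lambda x, y: x * x / (y + epsilon)
    # func_dict['log_x_div_y'] = lambda x, y: np.log1p(abs(x)) / (y + epsilon)  # requires numpy imported as np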
    
    def auto_features_make(train_data,test_data,func_dict,col_list):
        train_data, test_data = train_data.copy(), test_data.copy()
        for col_i in col_list:
            for col_j in col_list:
                for func_name, func in func_dict.items():
                    for data in [train_data,test_data]:
                        func_features = func(data[col_i],data[col_j])
                        col_func_features = '-'.join([col_i,func_name,col_j])
                        data[col_func_features] = func_features
        return train_data,test_data
    
    train_data2, test_data2 = auto_features_make(train_data,test_data,func_dict,col_list=test_data.columns)
    
    from sklearn.decomposition import PCA   # principal component analysis
    
    # dimensionality reduction with PCA
    pca = PCA(n_components=500)
    train_data2_pca = pca.fit_transform(train_data2.iloc[:,0:-1])
    test_data2_pca = pca.transform(test_data2)
    train_data2_pca = pd.DataFrame(train_data2_pca)
    test_data2_pca = pd.DataFrame(test_data2_pca)
    train_data2_pca['target'] = train_data2['target']
    
    X_train2 = train_data2[test_data2.columns].values
    y_train = train_data2['target']
    
    # offline cross-validation with LightGBM
    from sklearn.model_selection import KFold
    from sklearn.metrics import mean_squared_error
    import lightgbm as lgb
    import numpy as np
    
    # 5-fold cross-validation
    Folds = 5
    kf = KFold(n_splits=Folds, random_state=2019, shuffle=True)
    # record the training and validation MSE of each fold
    MSE_DICT = {
        'train_mse':[],
        'test_mse':[]
    }
    
    # offline training and validation
    for i, (train_index, test_index) in enumerate(kf.split(X_train2)):
        # LightGBM tree model
        lgb_reg = lgb.LGBMRegressor(
            learning_rate=0.01,
            max_depth=-1,
            n_estimators=5000,
            boosting_type='gbdt',
            random_state=2019,
            objective='regression',
        )
       
        # split into the training fold and the validation fold
        X_train_KFold, X_test_KFold = X_train2[train_index], X_train2[test_index]
        y_train_KFold, y_test_KFold = y_train[train_index], y_train[test_index]
        
        # train the model
        lgb_reg.fit(
                X=X_train_KFold,y=y_train_KFold,
                eval_set=[(X_train_KFold, y_train_KFold),(X_test_KFold, y_test_KFold)],
                eval_names=['Train','Test'],
                early_stopping_rounds=100,
                eval_metric='MSE',
                verbose=50
            )
    
    
        # predict on the training fold and the validation fold
        y_train_KFold_predict = lgb_reg.predict(X_train_KFold,num_iteration=lgb_reg.best_iteration_)
        y_test_KFold_predict = lgb_reg.predict(X_test_KFold,num_iteration=lgb_reg.best_iteration_) 
        
        print('Fold {}: training and validation MSE'.format(i))
        train_mse = mean_squared_error(y_train_KFold_predict, y_train_KFold)
        print('------\n', 'train MSE\n', train_mse, '\n------')
        test_mse = mean_squared_error(y_test_KFold_predict, y_test_KFold)
        print('------\n', 'validation MSE\n', test_mse, '\n------\n')
        
        MSE_DICT['train_mse'].append(train_mse)
        MSE_DICT['test_mse'].append(test_mse)
    print('------\n', 'train MSE\n', MSE_DICT['train_mse'], '\n', np.mean(MSE_DICT['train_mse']), '\n------')
    print('------\n', 'validation MSE\n', MSE_DICT['test_mse'], '\n', np.mean(MSE_DICT['test_mse']), '\n------')
    

    output

    Training until validation scores don't improve for 100 rounds
    [50]    Train's l2: 0.413128    Test's l2: 0.455026
    [100]   Train's l2: 0.198017    Test's l2: 0.245523
    [150]   Train's l2: 0.108839    Test's l2: 0.164432
    [200]   Train's l2: 0.0683439   Test's l2: 0.132008
    [250]   Train's l2: 0.0478354   Test's l2: 0.117217
    [300]   Train's l2: 0.035836    Test's l2: 0.110572
    [350]   Train's l2: 0.0279916   Test's l2: 0.10673
    [400]   Train's l2: 0.0225218   Test's l2: 0.104686
    [450]   Train's l2: 0.0183733   Test's l2: 0.103133
    [500]   Train's l2: 0.0151476   Test's l2: 0.102168
    [550]   Train's l2: 0.012598    Test's l2: 0.101216
    [600]   Train's l2: 0.0105448   Test's l2: 0.100722
    [650]   Train's l2: 0.00886925  Test's l2: 0.100606
    [700]   Train's l2: 0.00751108  Test's l2: 0.100288
    [750]   Train's l2: 0.00639588  Test's l2: 0.100224
    [800]   Train's l2: 0.00547284  Test's l2: 0.100142
    [850]   Train's l2: 0.00469886  Test's l2: 0.0999705
    [900]   Train's l2: 0.00405206  Test's l2: 0.0997473
    [950]   Train's l2: 0.00350702  Test's l2: 0.0997148
    [1000]  Train's l2: 0.00304687  Test's l2: 0.0996172
    [1050]  Train's l2: 0.00265095  Test's l2: 0.0995466
    [1100]  Train's l2: 0.00231359  Test's l2: 0.0993888
    [1150]  Train's l2: 0.00202587  Test's l2: 0.0992934
    [1200]  Train's l2: 0.00178175  Test's l2: 0.0992655
    [1250]  Train's l2: 0.0015751   Test's l2: 0.0992336
    [1300]  Train's l2: 0.00139401  Test's l2: 0.0992164
    [1350]  Train's l2: 0.00123563  Test's l2: 0.0991488
    [1400]  Train's l2: 0.00110108  Test's l2: 0.0991156
    [1450]  Train's l2: 0.000984264 Test's l2: 0.0990345
    [1500]  Train's l2: 0.000882609 Test's l2: 0.0989801
    [1550]  Train's l2: 0.000793992 Test's l2: 0.0989664
    [1600]  Train's l2: 0.000717061 Test's l2: 0.0989519
    [1650]  Train's l2: 0.000649073 Test's l2: 0.0989394
    [1700]  Train's l2: 0.00058903  Test's l2: 0.0989571
    [1750]  Train's l2: 0.000536084 Test's l2: 0.0989557
    Early stopping, best iteration is:
    [1666]  Train's l2: 0.000628941 Test's l2: 0.0989285
    

    7.2 Model Ensembling

    pip install xgboost --user
    
    import warnings
    warnings.filterwarnings("ignore")
    import matplotlib.pyplot as plt
    plt.rcParams.update({'figure.max_open_warning': 0})
    import seaborn as sns
    # modelling
    import pandas as pd
    import numpy as np
    from scipy import stats
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score,cross_val_predict,KFold
    from sklearn.metrics import make_scorer,mean_squared_error
    from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
    from sklearn.svm import LinearSVR, SVR
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,AdaBoostRegressor
    from xgboost import XGBRegressor
    from sklearn.preprocessing import PolynomialFeatures,MinMaxScaler,StandardScaler
    
    #load_dataset
    with open("./zhengqi_train.txt")  as fr:
        data_train=pd.read_table(fr,sep="\t")
    with open("./zhengqi_test.txt") as fr_test:
        data_test=pd.read_table(fr_test,sep="\t")
        
    #merge train_set and test_set
    data_train["oringin"]="train"
    data_test["oringin"]="test"
    data_all=pd.concat([data_train,data_test],axis=0,ignore_index=True)
    
    data_all.drop(["V5","V9","V11","V17","V22","V28"],axis=1,inplace=True)
    
    # normalise numeric columns
    cols_numeric=list(data_all.columns)
    cols_numeric.remove("oringin")
    def scale_minmax(col):
        return (col-col.min())/(col.max()-col.min())
    scale_cols = [col for col in cols_numeric if col!='target']
    data_all[scale_cols] = data_all[scale_cols].apply(scale_minmax,axis=0)
    
    #Box-Cox
    cols_transform=data_all.columns[0:-2]
    for col in cols_transform:   
        # transform column
        data_all.loc[:,col], _ = stats.boxcox(data_all.loc[:,col]+1)
        
    # transform the target to adjust its distribution (an exponential transform, 1.5**target)
    sp = data_train.target
    data_train.target1 = np.power(1.5, sp)
    
    # function to get training samples
    def get_training_data():
        # extract training samples
        from sklearn.model_selection import train_test_split
        df_train = data_all[data_all["oringin"]=="train"]
        df_train["label"]=data_train.target1
        # separate the target and the features
        y = df_train.target
        X = df_train.drop(["oringin","target","label"],axis=1)
        X_train,X_valid,y_train,y_valid=train_test_split(X,y,test_size=0.3,random_state=100)
        return X_train,X_valid,y_train,y_valid
    
    # extract test data (without the target)
    def get_test_data():
        df_test = data_all[data_all["oringin"]=="test"].reset_index(drop=True)
        return df_test.drop(["oringin","target"],axis=1)
    
    from sklearn.metrics import make_scorer
    # metric for evaluation
    def rmse(y_true, y_pred):
        diff = y_pred - y_true
        sum_sq = sum(diff**2)    
        n = len(y_pred)   
        
        return np.sqrt(sum_sq/n)
    def mse(y_true, y_pred):
        return mean_squared_error(y_true, y_pred)
    
    # scorer to be used in sklearn model fitting
    rmse_scorer = make_scorer(rmse, greater_is_better=False)
    mse_scorer = make_scorer(mse, greater_is_better=False)
    
    def find_outliers(model, X, y, sigma=3):
    
        # predict y values using model
        try:
            y_pred = pd.Series(model.predict(X), index=y.index)
        # if predicting fails, try fitting the model first
        except:
            model.fit(X,y)
            y_pred = pd.Series(model.predict(X), index=y.index)
            
        # calculate residuals between the model prediction and true y values
        resid = y - y_pred
        mean_resid = resid.mean()
        std_resid = resid.std()
    
        # calculate z statistic, define outliers to be where |z|>sigma
        z = (resid - mean_resid)/std_resid    
        outliers = z[abs(z)>sigma].index
        
        return outliers
    
    from sklearn.linear_model import Ridge
    X_train, X_valid,y_train,y_valid = get_training_data()
    test=get_test_data()
    
    # find and remove outliers using a Ridge model
    outliers = find_outliers(Ridge(), X_train, y_train)
    
    # permanently remove these outliers from the data
    X_outliers=X_train.loc[outliers]
    y_outliers=y_train.loc[outliers]
    X_t=X_train.drop(outliers)
    y_t=y_train.drop(outliers)
    
    def get_trainning_data_omitoutliers():
        y1=y_t.copy()
        X1=X_t.copy()
        return X1,y1
    
    from sklearn.preprocessing import StandardScaler
    def train_model(model, param_grid=[], X=[], y=[], 
                    splits=5, repeats=5):
    
        # get unmodified training data, unless data to use already specified
        if len(y)==0:
            X,y = get_trainning_data_omitoutliers()
            #poly_trans=PolynomialFeatures(degree=2)
            #X=poly_trans.fit_transform(X)
            #X=MinMaxScaler().fit_transform(X)
        
        # create cross-validation method
        rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats)
        
        # perform a grid search if param_grid given
        if len(param_grid)>0:
            # setup grid search parameters
            gsearch = GridSearchCV(model, param_grid, cv=rkfold,
                                   scoring="neg_mean_squared_error",
                                   verbose=1, return_train_score=True)
    
            # search the grid
            gsearch.fit(X,y)
    
            # extract best model from the grid
            model = gsearch.best_estimator_        
            best_idx = gsearch.best_index_
    
            # get cv-scores for best model
            grid_results = pd.DataFrame(gsearch.cv_results_)       
            cv_mean = abs(grid_results.loc[best_idx,'mean_test_score'])
            cv_std = grid_results.loc[best_idx,'std_test_score']
    
        # no grid search, just cross-val score for given model    
        else:
            grid_results = []
            cv_results = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=rkfold)
            cv_mean = abs(np.mean(cv_results))
            cv_std = np.std(cv_results)
        
        # combine mean and std cv-score in to a pandas series
        cv_score = pd.Series({'mean':cv_mean,'std':cv_std})
    
        # predict y using the fitted model
        y_pred = model.predict(X)
        
        # print stats on model performance         
        print('----------------------')
        print(model)
        print('----------------------')
        print('score=',model.score(X,y))
        print('rmse=',rmse(y, y_pred))
        print('mse=',mse(y, y_pred))
        print('cross_val: mean=',cv_mean,', std=',cv_std)
        
        # residual plots
        y_pred = pd.Series(y_pred,index=y.index)
        resid = y - y_pred
        mean_resid = resid.mean()
        std_resid = resid.std()
        z = (resid - mean_resid)/std_resid    
        n_outliers = sum(abs(z)>3)
        
        plt.figure(figsize=(15,5))
        ax_131 = plt.subplot(1,3,1)
        plt.plot(y,y_pred,'.')
        plt.xlabel('y')
        plt.ylabel('y_pred');
        plt.title('corr = {:.3f}'.format(np.corrcoef(y,y_pred)[0][1]))
        ax_132=plt.subplot(1,3,2)
        plt.plot(y,y-y_pred,'.')
        plt.xlabel('y')
        plt.ylabel('y - y_pred');
        plt.title('std resid = {:.3f}'.format(std_resid))
        
        ax_133=plt.subplot(1,3,3)
        z.plot.hist(bins=50,ax=ax_133)
        plt.xlabel('z')
        plt.title('{:.0f} samples with z>3'.format(n_outliers))
    
        return model, cv_score, grid_results
    
    # places to store optimal models and scores
    opt_models = dict()
    score_models = pd.DataFrame(columns=['mean','std'])
    
    # no. k-fold splits
    splits=5
    # no. k-fold iterations
    repeats=5
    

    7.2.5 Single-Model Prediction Results

    Ridge regression

    model = 'Ridge'
    
    opt_models[model] = Ridge()
    alph_range = np.arange(0.25,6,0.25)
    param_grid = {'alpha': alph_range}
    
    opt_models[model],cv_score,grid_results = train_model(opt_models[model], param_grid=param_grid, 
                                                  splits=splits, repeats=repeats)
    
    cv_score.name = model
    score_models = score_models.append(cv_score)
    
    plt.figure()
    plt.errorbar(alph_range, abs(grid_results['mean_test_score']),
                 abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))
    plt.xlabel('alpha')
    plt.ylabel('score')
    

    output

    Fitting 25 folds for each of 23 candidates, totalling 575 fits
    [Parallel(n_jobs=1)]: Done 575 out of 575 | elapsed:   43.7s finished
    ----------------------
    Ridge(alpha=0.25, copy_X=True, fit_intercept=True, max_iter=None,
       normalize=False, random_state=None, solver='auto', tol=0.001)
    ----------------------
    score= 0.8962818810610906
    rmse= 0.31906002611988776
    mse= 0.10179930026762334
    cross_val: mean= 0.10638412641872626 , std= 0.007268366114488213
    Text(0, 0.5, 'score')
    
    Ridge regression

    Lasso regression

    model = 'Lasso'
    
    opt_models[model] = Lasso()
    alph_range = np.arange(1e-4,1e-3,4e-5)
    param_grid = {'alpha': alph_range}
    
    opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid, 
                                                  splits=splits, repeats=repeats)
    
    cv_score.name = model
    score_models = score_models.append(cv_score)
    
    plt.figure()
    plt.errorbar(alph_range, abs(grid_results['mean_test_score']),abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))
    plt.xlabel('alpha')
    plt.ylabel('score')
    

    output

    Fitting 25 folds for each of 23 candidates, totalling 575 fits
    [Parallel(n_jobs=1)]: Done 575 out of 575 | elapsed:  1.5min finished
    ----------------------
    Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
       normalize=False, positive=False, precompute=False, random_state=None,
       selection='cyclic', tol=0.0001, warm_start=False)
    ----------------------
    score= 0.8964039177441461
    rmse= 0.31887226486884535
    mse= 0.10167952130258699
    cross_val: mean= 0.10608347958156857 , std= 0.006409844698430518
    Text(0, 0.5, 'score')
    
    Lasso regression

    7.2.6 Boosting Methods for Model Fusion

    GBDT model

    model = 'GradientBoosting'
    opt_models[model] = GradientBoostingRegressor()
    
    param_grid = {'n_estimators':[150,250,350],
                  'max_depth':[1,2,3],
                  'min_samples_split':[5,6,7]}
    
    opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid, 
                                                  splits=splits, repeats=1)
    
    cv_score.name = model
    score_models = score_models.append(cv_score)
    

    output

    Fitting 5 folds for each of 27 candidates, totalling 135 fits
    [Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed:  1.1min finished
    ----------------------
    GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                 learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
                 max_leaf_nodes=None, min_impurity_decrease=0.0,
                 min_impurity_split=None, min_samples_leaf=1,
                 min_samples_split=5, min_weight_fraction_leaf=0.0,
                 n_estimators=250, presort='auto', random_state=None,
                 subsample=1.0, verbose=0, warm_start=False)
    ----------------------
    score= 0.9687667265046709
    rmse= 0.17508697226339623
    mse= 0.03065544785636322
    cross_val: mean= 0.0975195734183274 , std= 0.005170355408187927
    
    GBDT model

    XGBoost model

    model = 'XGB'
    opt_models[model] = XGBRegressor()
    
    param_grid = {'n_estimators':[100,200,300,400,500],
                  'max_depth':[1,2,3],
                 }
    
    opt_models[model], cv_score,grid_results = train_model(opt_models[model], param_grid=param_grid, 
                                                  splits=splits, repeats=1)
    
    cv_score.name = model
    score_models = score_models.append(cv_score)
    

    output

    Fitting 5 folds for each of 15 candidates, totalling 75 fits
    [Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  3.2min finished
    ----------------------
    XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
           colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
           importance_type='gain', interaction_constraints='',
           learning_rate=0.300000012, max_delta_step=0, max_depth=3,
           min_child_weight=1, missing=nan, monotone_constraints='()',
           n_estimators=100, n_jobs=0, num_parallel_tree=1,
           objective='reg:squarederror', random_state=0, reg_alpha=0,
           reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
           validate_parameters=1, verbosity=None)
    ----------------------
    score= 0.97027238602052
    rmse= 0.17081464670059526
    mse= 0.0291776435274492
    cross_val: mean= 0.10242834745407967 , std= 0.010701020260954689
    
    XGBoost model

    7.2.7 Bagging: Averaging the Predictions of Multiple Models

    def model_predict(test_data,test_y=[],stack=False):
        #poly_trans=PolynomialFeatures(degree=2)
        #test_data1=poly_trans.fit_transform(test_data)
        #test_data=MinMaxScaler().fit_transform(test_data)
        i=0
        y_predict_total=np.zeros((test_data.shape[0],))
        for model in opt_models.keys():
            if model!="LinearSVR" and model!="KNeighbors":
                y_predict=opt_models[model].predict(test_data)
                y_predict_total+=y_predict
                i+=1
            if len(test_y)>0:
                print("{}_mse:".format(model),mean_squared_error(y_predict,test_y))
        y_predict_mean=np.round(y_predict_total/i,3)
        if len(test_y)>0:
            print("mean_mse:",mean_squared_error(y_predict_mean,test_y))
        else:
            y_predict_mean=pd.Series(y_predict_mean)
            return y_predict_mean
    
    model_predict(X_valid,y_valid)
    

    output

    Ridge_mse: 0.13825688728309005
    Lasso_mse: 0.13841411408178492
    XGB_mse: 0.1413474866753928
    mean_mse: 0.1289751753171857
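
    For the competition submission, the same averaging function can be applied to the processed test set (a hedged sketch; the output file name is illustrative and not part of the original code):

    y_test_pred = model_predict(test)                        # test comes from get_test_data()
    y_test_pred.to_csv('submission.txt', index=False, header=False)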
    

    7.2.8 Stacking Methods for Model Fusion

    A simple model-fusion (stacking) example

    pip install mlxtend --user
    
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib.gridspec as gridspec
    import itertools
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
    from sklearn.ensemble import RandomForestClassifier
    
    ## install mlxtend with: pip install mlxtend
    from mlxtend.classifier import EnsembleVoteClassifier
    from mlxtend.data import iris_data
    from mlxtend.plotting import plot_decision_regions
    %matplotlib inline
    
    # Initializing Classifiers
    clf1 = LogisticRegression(random_state=0)
    clf2 = RandomForestClassifier(random_state=0)
    clf3 = SVC(random_state=0, probability=True)
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], weights=[2, 1, 1], voting='soft')
    
    # Loading some example data
    X, y = iris_data()
    X = X[:,[0, 2]]
    
    # Plotting Decision Regions
    gs = gridspec.GridSpec(2, 2)
    fig = plt.figure(figsize=(10, 8))
    
    for clf, lab, grd in zip([clf1, clf2, clf3, eclf],
                             ['Logistic Regression', 'Random Forest', 'RBF kernel SVM', 'Ensemble'],
                             itertools.product([0, 1], repeat=2)):
        clf.fit(X, y)
        ax = plt.subplot(gs[grd[0], grd[1]])
        fig = plot_decision_regions(X=X, y=y, clf=clf, legend=2)
        plt.title(lab)
    plt.show()
    
    A simple model-fusion (stacking) example
