Machine Learning Code Notes - Classification (Machine Learning)

Author: 深度学习模型优化 | Published 2019-06-28 06:54

    For k-fold cross-validation, regression problems generally use KFold, while classification problems generally use StratifiedKFold, which preserves the class ratio in every fold.
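    As a minimal sketch of this setup (the fold count and seed here are illustrative assumptions, not values from the post):

    from sklearn.model_selection import StratifiedKFold

    # shuffle with a fixed seed for reproducible folds; StratifiedKFold keeps
    # the class ratio of y in every train/valid split
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)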

    • Parameter: X - training data (pd.DataFrame or np.ndarray), usually already normalized
    • Parameter: X_test - test data (pd.DataFrame or np.ndarray), usually already normalized
    • Parameter: y - target values
    • Parameter: params - dict of hyperparameters passed to the model
    • Parameter: folds - k-fold splitter (e.g. StratifiedKFold); remember to set n_splits beforehand
    • Parameter: model_type - which model to train ('lgb', 'xgb', 'sklearn' or 'cat')
    • Parameter: eval_metric - the evaluation metric to use
    • Parameter: columns - names of the columns to use
    • Parameter: plot_feature_importance - whether to plot the LGB feature importance
    • Parameter: model - an sklearn model, used only when model_type is 'sklearn'
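    The function below also assumes the following imports. eval_auc is referenced but never defined in the post; a minimal stand-in built on sklearn's roc_auc_score is sketched here (the original likely used a faster custom implementation):

    import time

    import lightgbm as lgb
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    import seaborn as sns
    import xgboost as xgb
    from catboost import CatBoostClassifier
    from sklearn import metrics

    # assumed stand-in for the post's undefined eval_auc; LightGBM custom eval
    # functions must return a (name, value, is_higher_better) tuple
    def eval_auc(y_true, y_pred):
        return 'auc', metrics.roc_auc_score(y_true, y_pred), True
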
    def train_model_classification(X, X_test, y, params, folds, model_type='lgb', eval_metric='auc', columns=None, plot_feature_importance=False, model=None,
                                   verbose=10000, early_stopping_rounds=200, n_estimators=50000):
        """
        A function to train a variety of classification models.
        Returns dictionary with oof predictions, test predictions, scores and, if necessary, feature importances.
        
        :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
        :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
        :params: y - target
        :params: folds - folds to split data
        :params: model_type - type of model to use
        :params: eval_metric - metric to use
        :params: columns - columns to use. If None - use all columns
        :params: plot_feature_importance - whether to plot feature importance of LGB
        :params: model - sklearn model, works only for "sklearn" model type
        
        """
        columns = X.columns if columns is None else columns
        X_test = X_test[columns]
        
        # to set up scoring parameters
        metrics_dict = {'auc': {'lgb_metric_name': eval_auc,
                            'catboost_metric_name': 'AUC',
                            'sklearn_scoring_function': metrics.roc_auc_score},
                        }
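        # other metrics can be registered the same way, e.g. a hypothetical
        # 'logloss' entry mapping 'binary_logloss' (LightGBM), 'Logloss'
        # (CatBoost) and metrics.log_loss (sklearn)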
        
        result_dict = {}
        
        # out-of-fold predictions on train data
        # np.unique works whether y is a pd.Series or an np.ndarray
        oof = np.zeros((len(X), len(np.unique(y))))
        
        # averaged predictions on train data
        prediction = np.zeros((len(X_test), oof.shape[1]))
        
        # list of scores on folds
        scores = []
        feature_importance = pd.DataFrame()
        
        # split and train on folds
        # pass y as well so StratifiedKFold can stratify (plain KFold ignores it)
        for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
            print(f'Fold {fold_n + 1} started at {time.ctime()}')
            if isinstance(X, np.ndarray):
                # an ndarray has no column labels, so index rows directly
                X_train, X_valid = X[train_index], X[valid_index]
                y_train, y_valid = y[train_index], y[valid_index]
            else:
                X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
                y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
                
            if model_type == 'lgb':
                model = lgb.LGBMClassifier(**params, n_estimators=n_estimators, n_jobs = -1)
                model.fit(X_train, y_train, 
                        eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                        verbose=verbose, early_stopping_rounds=early_stopping_rounds)
                
                y_pred_valid = model.predict_proba(X_valid)
                y_pred = model.predict_proba(X_test, num_iteration=model.best_iteration_)
                
            if model_type == 'xgb':
                train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=list(columns))
                valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=list(columns))
    
                watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
                model = xgb.train(dtrain=train_data, num_boost_round=n_estimators, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose, params=params)
                # xgb.train with a binary objective returns 1-d positive-class
                # probabilities; stack into (n, 2) to match oof and prediction
                p_valid = model.predict(xgb.DMatrix(X_valid, feature_names=list(columns)), ntree_limit=model.best_ntree_limit)
                p_test = model.predict(xgb.DMatrix(X_test, feature_names=list(columns)), ntree_limit=model.best_ntree_limit)
                y_pred_valid = np.vstack([1 - p_valid, p_valid]).T
                y_pred = np.vstack([1 - p_test, p_test]).T
            
            if model_type == 'sklearn':
                model.fit(X_train, y_train)
                
                # use probabilities so the 2-d oof assignment and the
                # y_pred_valid[:, 1] scoring below both work
                y_pred_valid = model.predict_proba(X_valid)
                score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid[:, 1])
                print(f'Fold {fold_n + 1}. {eval_metric}: {score:.4f}.')
                print('')
                
                y_pred = model.predict_proba(X_test)
            
            if model_type == 'cat':
                # AUC cannot be optimized directly, so train with Logloss and
                # track AUC as the eval metric only
                model = CatBoostClassifier(iterations=n_estimators, eval_metric=metrics_dict[eval_metric]['catboost_metric_name'], **params,
                                          loss_function='Logloss')
                model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)
    
                # predict_proba returns (n, 2) class probabilities, matching oof
                y_pred_valid = model.predict_proba(X_valid)
                y_pred = model.predict_proba(X_test)
            
            oof[valid_index] = y_pred_valid
            scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid[:, 1]))
    
            prediction += y_pred    
            
            if model_type == 'lgb' and plot_feature_importance:
                # feature importance
                fold_importance = pd.DataFrame()
                fold_importance["feature"] = columns
                fold_importance["importance"] = model.feature_importances_
                fold_importance["fold"] = fold_n + 1
                feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
    
        prediction /= folds.n_splits
        
        print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
        
        result_dict['oof'] = oof
        result_dict['prediction'] = prediction
        result_dict['scores'] = scores
        
        if model_type == 'lgb':
            if plot_feature_importance:
                feature_importance["importance"] /= folds.n_splits
                cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                    by="importance", ascending=False)[:50].index
    
                best_features = feature_importance.loc[feature_importance.feature.isin(cols)]
    
                plt.figure(figsize=(16, 12));
                sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
                plt.title('LGB Features (avg over folds)');
                
                result_dict['feature_importance'] = feature_importance
            
        return result_dict
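
    A hedged usage sketch (the hyperparameters, fold count and seed are illustrative assumptions, not values from the post; X, X_test and y are assumed to be prepared as described above):

    from sklearn.model_selection import StratifiedKFold

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    params = {'objective': 'binary', 'learning_rate': 0.05, 'num_leaves': 63}

    result = train_model_classification(X, X_test, y, params, folds,
                                        model_type='lgb', eval_metric='auc',
                                        plot_feature_importance=True)

    # result['oof'] holds out-of-fold class probabilities for the training set;
    # result['prediction'] holds the fold-averaged test probabilities
    test_probs = result['prediction'][:, 1]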
    
