美文网首页
XGBoost baseline

XGBoost baseline

作者: 请不要问我是谁 | 来源:发表于2018-09-16 19:28 被阅读0次

    XGBoost是一个优化的分布式梯度增强库,它在Gradient Boosting框架下实现机器学习算法。XGBoost提供了并行树提升(也称为GBDT,GBM)。
    使用交叉验证,以f1为评价方法的baseline:

    #!/usr/bin/env python
    # _*_ coding:utf-8 _*_
    
    import matplotlib.pyplot as plt
    import pandas as pd
    import numpy as np
    import xgboost as xgb
    from sklearn.externals import joblib
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import f1_score
    import warnings
    warnings.filterwarnings('ignore')
    
    pd.set_option('display.max_columns', 500, 'display.width', 1000)  # widen pandas display for inspection
    data_train = pd.read_csv("train_3.csv")
    data_test = pd.read_csv("test_3.csv")
    # Encode current_service as integer labels: build both directions of the mapping
    # (label -> service for decoding predictions, service -> label for training).
    label2current_service = dict(zip(range(0, len(set(data_train['current_service']))),
                                     sorted(list(set(data_train['current_service'])))))
    current_service2label = dict(zip(sorted(list(set(data_train['current_service']))),
                                     range(0, len(set(data_train['current_service'])))))
    data_train['current_service'] = data_train['current_service'].map(current_service2label)
    # print(len(set(data_train['current_service'])))      # 15 distinct classes
    y = data_train.pop('current_service')
    user_id = data_train.pop('user_id')
    x_train = data_train
    test_user_id = data_test.pop('user_id')
    x_test = data_test
    print(x_train.info())
    # Drop down to raw numpy arrays for xgboost / sklearn.
    X, y, X_test = x_train.values, y.values, x_test.values
    n_splits = 5
    seed = 2333
    
    
    # K-fold model scheme below.
    # Custom macro-F1 evaluation hook for xgb.train(feval=...).
    def f1_score_vail(pred, data_vail):
        """Return ('1-f1_score', 1 - macro_f1) for a validation DMatrix.

        With objective "multi:softmax", ``pred`` already holds class labels.
        xgboost treats lower eval values as better, hence the 1 - F1 flip.
        """
        y_true = data_vail.get_label()
        macro_f1 = f1_score(y_true=y_true, y_pred=pred, average='macro')
        return '1-f1_score', 1 - macro_f1
    
    
    # xgb training parameters.
    # NOTE(review): the original dict set both "learning_rate": 0.05 and its alias
    # "eta": 0.1 — conflicting duplicates. Since "eta" came last in the dict, 0.1 is
    # what xgboost effectively used, so only "eta" is kept. "num_boost_round" was
    # also dropped: it is not a booster parameter, and the round count is already
    # passed explicitly to xgb.train() below.
    xgb_params = {
        "max_depth": 7,                    # maximum tree depth
        "objective": "multi:softmax",      # predict() returns class labels directly
        "silent": 1,                       # legacy flag; "verbosity" on xgboost >= 1.0
        "eta": 0.1,                        # learning rate (alias of learning_rate)
        "n_jobs": -1,                      # use all CPU cores
        "num_class": 15,                   # 15 distinct current_service classes
        "subsample": 0.8,                  # row subsampling per tree
        "min_child_weight": 2,
        "seed": 2333,
        "alpha": 0.1,                      # L1 regularization
        "lambda": 0.2,                     # L2 regularization
        "predictor": "gpu_predictor",
        "colsample_bytree": 0.7,           # feature subsampling per tree
        "colsample_bylevel": 0.7,          # feature subsampling per level
        "tree_method": 'gpu_exact'         # deprecated on modern xgboost; use "gpu_hist"
    }
    
    # 5-fold stratified CV: train one GPU xgboost model per fold, score each fold
    # with macro-F1, collect per-fold test predictions, then majority-vote them.
    x_score = []
    cv_pred = []
    skf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
    X_test = xgb.DMatrix(X_test)    # convert once to xgb's required data format
    
    for index, (train_index, test_index) in enumerate(skf.split(X, y)):
        print(index)
        X_train, X_valid, y_train, y_valid = X[train_index], X[test_index], y[train_index], y[test_index]
        train_data = xgb.DMatrix(X_train, y_train)      # training fold
        validation_data = xgb.DMatrix(X_valid, y_valid)     # validation fold
        # NOTE(review): the watchlist entry is labeled 'train' but actually holds the
        # validation fold — early stopping still works, only the printed name is misleading.
        watchlist = [(validation_data, 'train')]    # data monitored for early stopping
        # Train with custom macro-F1 feval; stops after 100 rounds without improvement.
        clf = xgb.train(xgb_params, train_data, num_boost_round=100000, early_stopping_rounds=100, feval=f1_score_vail,
                        evals=watchlist, verbose_eval=1)    # train
        # NOTE(review): sklearn.externals.joblib is deprecated/removed in modern
        # sklearn — import joblib directly on newer environments.
        joblib.dump(clf, "model/xgb_{}.m".format(index))
        # clf = joblib.load("model/xgb_{}.m".format(index))
        X_valid = xgb.DMatrix(X_valid)  # convert to xgb's required format
        x_pred = clf.predict(X_valid)
        x_score.append(f1_score(y_valid, x_pred, average='macro'))
        y_test = clf.predict(X_test)
        # Stack this fold's test predictions as a new column: (n_test, fold_index+1).
        if index == 0:
            cv_pred = np.array(y_test).reshape(-1, 1)
        else:
            cv_pred = np.hstack((cv_pred, np.array(y_test).reshape(-1, 1)))
        if index == 4:
            # After the last fold, plot feature importances of the final model.
            xgb.plot_importance(clf, max_num_features=25)
            plt.title("Feature_importance")
            plt.show()
    cv_pred = cv_pred.astype(np.int64)  # cast to int64 so np.bincount below accepts it
    # Majority vote across the 5 folds' predictions for each test row.
    submit = []
    for line in cv_pred:
        submit.append(np.argmax(np.bincount(line)))
    # Save results.
    # NOTE(review): unique() drops duplicate user_ids — if test has duplicates, this
    # column will be shorter than `submit` and the assignment will raise; verify
    # user_id is unique in test_3.csv.
    df_test = pd.DataFrame()
    df_test['id'] = list(test_user_id.unique())
    df_test['predict'] = submit
    df_test['predict'] = df_test['predict'].map(label2current_service)  # decode labels back to service names
    df_test.to_csv('output/xgb.csv', index=False)
    print(x_score, np.mean(x_score))
    
    
    
    

    相关文章

      网友评论

          本文标题:XGBoost baseline

          本文链接:https://www.haomeiwen.com/subject/tbpfnftx.html