xgboost multi-class training and feature selection


Author: 婉妃 | Published 2019-06-12 15:13

Sort features by information gain:
bst.get_score(importance_type='gain')

{'ftr_col1': 77.21064539577829,
'ftr_col2': 10.28690566363971,
'ftr_col3': 24.225014841466294,
'ftr_col4': 11.234086283060112}

Note: get_score() is a method of the Booster object returned by train(); it is defined as follows:

get_score(fmap='', importance_type='weight')
fmap (str (optional)) – The name of feature map file.
importance_type
‘weight’ - the number of times a feature is used to split the data across all trees.
‘gain’ - the average gain across all splits the feature is used in.
‘cover’ - the average coverage across all splits the feature is used in.
‘total_gain’ - the total gain across all splits the feature is used in.
‘total_cover’ - the total coverage across all splits the feature is used in.

API docs: https://xgboost.readthedocs.io/en/latest/python/python_api.html
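
For a quick side-by-side comparison, the snippet below (a sketch; it assumes the same trained Booster bst as above) prints every supported importance type:

for imp_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
    print(imp_type, bst.get_score(importance_type=imp_type))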

Sorting features by importance:

importance = bst.get_score(importance_type='gain')
sorted_importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)

Appendix: a complete multi-class model-training example I recently finished for a project; the feature names have been masked:

# coding: utf-8

import os
import operator
import pandas as pd
import numpy as np

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

data_folder = '/home/leizhou.xlz/workshop/data'
sample_file = os.path.join(data_folder, 'samples.txt')

df = pd.read_csv(sample_file, sep="\t")
print("number of raw samples: {}".format(len(df)))

df = df[df['label'].notnull()]
print("number of not na samples: {}".format(len(df)))

feature_cols = ['f1','f2',...,'fn']
target_col = ['label']

def apk(actual, predicted, k=5):
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk5(actual, predicted, k=5):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])
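
# Toy sanity check for the two helpers above (hypothetical values, not from
# the dataset below): apk() scores one ranked prediction list against the
# relevant items; mapk5() averages apk() over samples.
#   apk([2], [2, 0, 1], k=5)             # hit at rank 1 -> 1.0
#   mapk5([[2], [0]], [[2, 1], [1, 0]])  # mean of 1.0 and 0.5 -> 0.75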

def map5eval(preds, dtrain):
    # Custom MAP@5 metric; expects class-probability predictions of shape
    # (n_samples, n_classes), e.g. from objective 'multi:softprob'.
    actual = dtrain.get_label()
    # Columns of `predicted` are the top-5 classes, best first.
    predicted = preds.argsort(axis=1)[:, -np.arange(1, 6)]
    metric = 0.
    for i in range(5):
        metric += np.sum(actual == predicted[:, i]) / (i + 1)
    metric /= actual.shape[0]
    return 'MAP@5', metric
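
# To monitor MAP@5 during training, this metric could be passed to xgb.train()
# via its `feval` argument (a sketch; pair it with objective 'multi:softprob'
# so `preds` holds per-class probabilities, and add maximize=True if combined
# with early stopping, since higher MAP is better):
#   bst = xgb.train(param, dtrain, num_round, watchlist, feval=map5eval)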

def feature_preprocess(df):
    # Label-encode every string-typed column (plus the target column, in case
    # the labels are strings), then fill missing values with a sentinel.
    for f in df.columns:
        if df[f].dtype == 'object' or f in target_col:
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(df[f].values))
            df[f] = lbl.transform(list(df[f].values))
    df.fillna(-999, inplace=True)
    return df


df = feature_preprocess(df)

X_train, X_test, y_train, y_test = train_test_split(df[feature_cols], df[target_col],
                                                    test_size=0.2, random_state=20190612)

print("number of train samples: {}".format(len(X_train)))
print("number of test samples: {}".format(len(X_test)))

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

print ("labels numbers:")
print(y_train.groupby(['rf_speed'])['rf_speed'].count().to_frame('count'))

param = {}
param['eta'] = 0.01
param['max_depth'] = 6
param['booster'] = 'gbtree'
param['silent'] = 1
param['nthread'] = 4
param['min_child_weight'] = 3
param['num_class'] = 5

watchlist = [(dtrain, 'train'), (dtest, 'test')]
num_round = 5000

# train1
param['objective'] = 'multi:softmax' # predict() returns shape (5080,): class labels directly, e.g. [0, 1, 3, ...]
bst = xgb.train(param, dtrain, num_round, watchlist)

# get prediction
pred = bst.predict(dtest)
error_rate = np.sum(pred != y_test.squeeze(axis=1).values)*1.0 / y_test.shape[0]
print('Test error using softmax = {}'.format(error_rate))


# train2
#param['objective'] = 'multi:softprob' # predict() returns shape (5080, 5): per-class probabilities, e.g. [[0.20230168 0.14427288 0.29164004 0.20053586 0.16124953], ...]
#bst = xgb.train(param, dtrain, num_round, watchlist)
#
## get prediction
#pred_prob = bst.predict(dtest).reshape(y_test.shape[0], 5)
#pred_label = np.argmax(pred_prob, axis=1)
#error_rate = np.sum(pred_label != y_test.squeeze(axis=1).values)*1.0 / y_test.shape[0]
#print('Test error using softprob = {}'.format(error_rate))
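#
## MAP@5 could also be computed offline from the softprob output (a sketch,
## assuming `pred_prob` from the commented block above; each true label is
## wrapped in a list because apk() expects a collection of relevant items):
#top5 = pred_prob.argsort(axis=1)[:, ::-1][:, :5]
#actual = [[label] for label in y_test.squeeze(axis=1).values]
#print('Test MAP@5 using softprob = {}'.format(mapk5(actual, top5.tolist())))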

importance = bst.get_score(importance_type='gain')
sorted_importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
print('feature importances[gain]:')
print(sorted_importance)

# cv
#print(xgb.cv(param, dtrain, 1000, nfold=5, early_stopping_rounds=50))
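
Finally, to close the loop on feature selection: one natural follow-up (a sketch, not part of the run above; the top_n cutoff of 10 is an arbitrary choice) is to keep only the highest-gain features and retrain on the reduced set:

top_n = 10  # arbitrary cutoff for illustration
selected = [name for name, gain in sorted_importance[:top_n]]
dtrain_sel = xgb.DMatrix(X_train[selected], label=y_train)
dtest_sel = xgb.DMatrix(X_test[selected], label=y_test)
bst_sel = xgb.train(param, dtrain_sel, num_round,
                    [(dtrain_sel, 'train'), (dtest_sel, 'test')])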
