Sorted by information gain:
bst.get_score(importance_type='gain')
{'ftr_col1': 77.21064539577829,
'ftr_col2': 10.28690566363971,
'ftr_col3': 24.225014841466294,
'ftr_col4': 11.234086283060112}
Note: get_score() is a method of the Booster object returned by train(); it is defined as follows:
get_score(fmap='', importance_type='weight')
fmap (str (optional)) – The name of feature map file.
importance_type
‘weight’ - the number of times a feature is used to split the data across all trees.
‘gain’ - the average gain across all splits the feature is used in.
‘cover’ - the average coverage across all splits the feature is used in.
‘total_gain’ - the total gain across all splits the feature is used in.
‘total_cover’ - the total coverage across all splits the feature is used in.
API docs: https://xgboost.readthedocs.io/en/latest/python/python_api.html
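To see how importance_type changes what get_score() returns, here is a minimal self-contained sketch on synthetic data (the feature names f0–f3, the binary objective, and the round count are made up for illustration, not taken from the project above):

import numpy as np
import xgboost as xgb

rng = np.random.RandomState(0)
X = rng.rand(200, 4)
y = (X[:, 0] > 0.5).astype(int)
dtrain = xgb.DMatrix(X, label=y, feature_names=['f0', 'f1', 'f2', 'f3'])
bst = xgb.train({'objective': 'binary:logistic'}, dtrain, num_boost_round=10)

# same booster, three different views of importance
for itype in ['weight', 'gain', 'total_gain']:
    print(itype, bst.get_score(importance_type=itype))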
Sorting features by importance (descending):
import operator

importance = bst.get_score(importance_type='gain')
sorted_importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
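If a chart is more convenient than the raw dict, xgboost's built-in plot_importance() renders the same scores; a minimal sketch, assuming matplotlib is installed (note that plot_importance's importance_type also defaults to 'weight', so 'gain' must be passed explicitly):

import matplotlib.pyplot as plt
import xgboost as xgb

xgb.plot_importance(bst, importance_type='gain')
plt.show()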
Appendix: a complete multiclass model-training example I recently finished in one of my projects, with the feature names masked:
# coding: utf-8
import os
import operator
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
data_folder = '/home/leizhou.xlz/workshop/data'
sample_file = os.path.join(data_folder, 'samples.txt')
df = pd.read_csv(sample_file, sep="\t")
print("number of raw samples: {}".format(len(df)))
df = df[df['label'].notnull()]
print("number of not na samples: {}".format(len(df)))
feature_cols = ['f1','f2',...,'fn']  # placeholder: the real (masked) feature names go here
target_col = ['label']
def apk(actual, predicted, k=5):
    """Average precision at k for a single sample."""
    if len(predicted) > k:
        predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    if not actual:
        return 0.0
    return score / min(len(actual), k)
def mapk5(actual, predicted, k=5):
    """Mean average precision at k over all samples."""
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])
def map5eval(preds, dtrain):
    """Custom MAP@5 eval metric; expects preds of shape (n_samples, num_class)."""
    actual = dtrain.get_label()
    # columns -1..-5 of the ascending argsort are the top-5 predicted classes
    predicted = preds.argsort(axis=1)[:, -np.arange(1, 6)]
    metric = 0.
    for i in range(5):
        metric += np.sum(actual == predicted[:, i]) / (i + 1)
    metric /= actual.shape[0]
    return 'MAP@5', metric
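# map5eval is not wired into the training calls below; one way to use it
# (an assumption about the intent, not something this script actually does)
# is the feval argument: xgb.train(param, dtrain, num_round, watchlist, feval=map5eval).
# That pairing needs the 'multi:softprob' objective, so that preds is an
# (n_samples, num_class) probability matrix rather than plain labels.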
def feature_preprocess(df):
    # label-encode object columns (and the target, so classes become 0..num_class-1),
    # then fill remaining NaNs with a sentinel value
    for f in df.columns:
        if df[f].dtype == 'object' or f in target_col:
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(df[f].values))
            df[f] = lbl.transform(list(df[f].values))
    df.fillna(-999, inplace=True)
    return df
df = feature_preprocess(df)
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_cols], df[target_col], test_size=0.2, random_state=20190612)
print("number of train samples: {}".format(len(X_train)))
print("number of test samples: {}".format(len(X_test)))
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
print ("labels numbers:")
print(y_train.groupby(['rf_speed'])['rf_speed'].count().to_frame('count'))
param = {}
param['eta'] = 0.01
param['max_depth'] = 6
param['booster'] = 'gbtree'
param['silent'] = 1
param['nthread'] = 4
param['min_child_weight'] = 3
param['num_class'] = 5
watchlist = [(dtrain, 'train'), (dtest, 'test')]
num_round = 5000
# train1
param['objective'] = 'multi:softmax'  # pred shape: (5080,), outputs class labels directly, e.g. [0, 1, 3, ...]
bst = xgb.train(param, dtrain, num_round, watchlist)
# get prediction
pred = bst.predict(dtest)  # predicted class labels
error_rate = np.sum(pred != y_test.squeeze(axis=1).values)*1.0 / y_test.shape[0]
print('Test error using softmax = {}'.format(error_rate))
# train2
#param['objective'] = 'multi:softprob'  # pred shape: (5080, 5), per-class probabilities, e.g. [[0.2023, 0.1443, 0.2916, 0.2005, 0.1612], ...]
#bst = xgb.train(param, dtrain, num_round, watchlist)
#
## get prediction
#pred_prob = bst.predict(dtest).reshape(y_test.shape[0], 5)
#pred_label = np.argmax(pred_prob, axis=1)
#error_rate = np.sum(pred_label != y_test.squeeze(axis=1).values)*1.0 / y_test.shape[0]
#print('Test error using softprob = {}'.format(error_rate))
importance = bst.get_score(importance_type='gain')
sorted_importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
print('feature importances[gain]:')
print(sorted_importance)
# cv
#print(xgb.cv(param, dtrain, 1000, nfold=5, early_stopping_rounds=50))
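For completeness, here is one way the commented-out cross-validation step might be filled in; a minimal sketch, assuming mlogloss as the metric (the round count, fold count, and seed are illustrative, not tuned):

cv_results = xgb.cv(param, dtrain, num_boost_round=1000, nfold=5,
                    metrics='mlogloss', early_stopping_rounds=50,
                    seed=20190612)
print(cv_results.tail())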