美文网首页
1.xgboost_cheatsheet

1.xgboost_cheatsheet

作者: 许志辉Albert | 来源:发表于2021-01-30 09:13 被阅读0次

    1. xgboost速查表

    1.1内置建模方式

    • xgb.train训练方式
    • DMatrix数据形态,不是DataFrame

    1.1.1 读取libsvm格式数据并指定参数建模

    import numpy as np
    import pandas as pd
    import pickle 
    import xgboost as xgb  
    
    # 基本例子,从libsvm文件中读取数据,做二分类
    # 数据是libsvm的格式
    #1 3:1 10:1 11:1 21:1 30:1 34:1 36:1 40:1 41:1 53:1 58:1 65:1 69:1 77:1 86:1 88:1 92:1 95:1 102:1 105:1 117:1 124:1
    #0 3:1 10:1 20:1 21:1 23:1 34:1 36:1 39:1 41:1 53:1 56:1 65:1 69:1 77:1 86:1 88:1 92:1 95:1 102:1 106:1 116:1 120:1
    #0 1:1 10:1 19:1 21:1 24:1 34:1 36:1 39:1 42:1 53:1 56:1 65:1 69:1 77:1 86:1 88:1 92:1 95:1 102:1 106:1 116:1 122:1
    
    # Load the libsvm-format train/test splits straight into DMatrix objects.
    # NOTE(review): the train path originally lacked the './data/' prefix used
    # by every other dataset in this sheet; aligned here - confirm the location.
    dtrain = xgb.DMatrix('./data/agaricus.txt.train')
    dtest = xgb.DMatrix('./data/agaricus.txt.test')

    # Hyper-parameters. The objective name must be spelled 'binary:logistic'
    # with no embedded whitespace.
    param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}

    # The watchlist names the datasets whose metric is reported while training.
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    num_round = 2
    # Optional arguments are passed by keyword so their meaning is explicit.
    bst = xgb.train(param, dtrain, num_round, evals=watchlist)

    # Predict with the trained booster.
    preds = bst.predict(dtest)

    # Error rate: fraction of rows whose thresholded prediction (> 0.5)
    # differs from the true label.
    labels = dtest.get_label()
    print('错误率为%f' % np.mean((preds > 0.5) != labels))

    # Persist the model.
    bst.save_model('./model/0001.model')
    
    1

    1.1.2 配合pandas DataFrame格式数据建模

    # Pima Indians Diabetes dataset. Its fields include: number of pregnancies,
    # plasma glucose concentration in an oral glucose tolerance test, diastolic
    # blood pressure (mm Hg), triceps skin-fold thickness (mm), 2-hour serum
    # insulin (μU/ml), body mass index (kg/(height(m)^2)), diabetes pedigree
    # function, and age (years).
    import pandas as pd
    csv_path = './data/Pima-Indians-Diabetes.csv'
    data = pd.read_csv(csv_path)
    # Show the first rows of the table (notebook cell output).
    data.head()
    
    2
    import numpy as np
    import pandas as pd
    import pickle
    import xgboost as xgb
    from sklearn.model_selection import train_test_split
    
    # Basic example: read a CSV file and train a binary classifier.

    # Load the data with pandas.
    data = pd.read_csv('./data/Pima-Indians-Diabetes.csv')

    # Split the rows into train and test partitions.
    train, test = train_test_split(data)

    # Convert to DMatrix: the feature columns are the data, 'Outcome' the label.
    feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
    target_column = 'Outcome'
    # DMatrix objects are initialised from the underlying numpy arrays.
    xgtrain = xgb.DMatrix(train[feature_columns].values, train[target_column].values)
    xgtest = xgb.DMatrix(test[feature_columns].values, test[target_column].values)

    # Hyper-parameters.
    param = {'max_depth': 5, 'eta': 0.1, 'silent': 1, 'subsample': 0.7, 'colsample_bytree': 0.7, 'objective': 'binary:logistic'}

    # The watchlist names the datasets whose metric is reported while training.
    watchlist = [(xgtest, 'eval'), (xgtrain, 'train')]
    num_round = 10
    # Optional arguments are passed by keyword so their meaning is explicit.
    bst = xgb.train(param, xgtrain, num_round, evals=watchlist)

    # Predict with the trained booster.
    preds = bst.predict(xgtest)

    # Error rate: fraction of rows whose thresholded prediction (> 0.5)
    # differs from the true label.
    labels = xgtest.get_label()
    print('错误率为%f' % np.mean((preds > 0.5) != labels))

    # Persist the model.
    bst.save_model('./model/0002.model')
    
    3

    1.2预估器建模方式(sklearn形态)

    import warnings
    warnings.filterwarnings('ignore')
    import numpy as np
    import pandas as pd
    import pickle
    import xgboost as xgb
    from sklearn.model_selection import train_test_split
    # Prefer the standalone joblib package; fall back to the copy bundled with
    # older scikit-learn releases when it is not installed separately.
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib

    # Basic example: read a CSV file and train a binary classifier.

    # Load the data with pandas.
    data = pd.read_csv('./data/Pima-Indians-Diabetes.csv')

    # Split the rows into train and test partitions.
    train, test = train_test_split(data)

    # Separate the feature matrix X from the classification target y.
    feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
    target_column = 'Outcome'
    train_X = train[feature_columns].values
    train_y = train[target_column].values
    test_X = test[feature_columns].values
    test_y = test[target_column].values

    # Initialise the model.
    xgb_classifier = xgb.XGBClassifier(n_estimators=20, max_depth=4, learning_rate=0.1, subsample=0.7, colsample_bytree=0.7)

    # Fit the model.
    xgb_classifier.fit(train_X, train_y)

    # Predict class labels for the test partition.
    preds = xgb_classifier.predict(test_X)

    # Error rate: the whole ratio is parenthesised so that '%' formats the
    # quotient instead of binding to the numerator alone.
    print('错误率为%f' % ((preds != test_y).sum() / float(test_y.shape[0])))

    # Persist the fitted estimator.
    joblib.dump(xgb_classifier, './model/0003.model')
    
    4

    1.3 内置建模方式:交叉验证与高级功能

    1.3.1 交叉验证

    # 5-fold cross-validation reporting the 'error' metric, with a fixed seed.
    cv_options = {'nfold': 5, 'metrics': {'error'}, 'seed': 0}
    xgb.cv(param, dtrain, num_round, **cv_options)
    
    5

    1.3.2 添加预处理的交叉验证

    # Compute the negative/positive label ratio and use it as the positive-class weight.
    def fpreproc(dtrain, dtest, param):
        """Preprocessing hook that sets ``scale_pos_weight`` from the training labels.

        The weight is (number of labels equal to 0) / (number of labels equal
        to 1), taken from ``dtrain.get_label()``.

        Args:
            dtrain: training data object exposing ``get_label()``.
            dtest: evaluation data object; returned untouched.
            param: parameter dict; ``param['scale_pos_weight']`` is set in place.

        Returns:
            The tuple ``(dtrain, dtest, param)``.
        """
        label = dtrain.get_label()
        # '==' (comparison) on both sides: count negatives and positives.
        ratio = float(np.sum(label == 0) / np.sum(label == 1))
        param['scale_pos_weight'] = ratio
        return (dtrain, dtest, param)
    
    
    # Preprocess first (compute the sample weight), then cross-validate.
    xgb.cv(
        param,
        dtrain,
        num_round,
        nfold=5,
        metrics={'auc'},
        seed=0,
        fpreproc=fpreproc,
    )
    
    
    6

    1.3.3 自定义损失函数与评估准则

    print('使用自定义函数进行交叉验证')
    # A custom objective has to supply the first and second derivatives of the loss.
    def logregobj(preds, dtrain):
        """Logistic-loss objective: return (gradient, hessian) for raw scores.

        ``preds`` are passed through the sigmoid; the gradient is the sigmoid
        output minus the labels from ``dtrain.get_label()``, and the hessian
        is ``p * (1 - p)`` of that sigmoid output.
        """
        targets = dtrain.get_label()
        prob = 1.0 / (1.0 + np.exp(-preds))
        gradient = prob - targets
        hessian = prob * (1.0 - prob)
        return gradient, hessian
    
    # Custom evaluation metric: measures the gap between predictions and the true labels.
    def evalerror(preds, dtrain):
        """Return ``('error', rate)`` where rate is the misclassification fraction.

        A prediction counts as positive when its value is greater than 0.0;
        the rate is the number of rows where that decision differs from
        ``dtrain.get_label()``, divided by the number of labels.
        """
        labels = dtrain.get_label()
        mismatches = np.sum(labels != (preds > 0.0))
        return 'error', float(mismatches) / len(labels)
    
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    param = {'max_depth': 3, 'eta': 0.1, 'silent': 1}
    num_round = 5

    # Train with the custom objective and metric. The optional arguments are
    # passed by keyword so the call does not depend on their positional order.
    # NOTE(review): recent XGBoost releases offer `custom_metric` as the
    # successor of `feval` - confirm against the installed version.
    bst = xgb.train(param, dtrain, num_round, evals=watchlist, obj=logregobj, feval=evalerror)

    # Cross-validation with the same custom objective and metric.
    xgb.cv(param, dtrain, num_round, nfold=5, seed=0, obj=logregobj, feval=evalerror)
    
    7

    1.3.4 只用前n棵树预测

    import numpy as np
    import pandas as pd
    import pickle
    import xgboost as xgb
    from sklearn.model_selection import train_test_split
    
    # Basic example: read a CSV file and train a binary classifier.

    # Load the data with pandas.
    data = pd.read_csv('./data/Pima-Indians-Diabetes.csv')

    # Split the rows into train and test partitions.
    train, test = train_test_split(data)

    # Convert to DMatrix. The column names must match the CSV header exactly
    # (same list as the earlier examples in this sheet).
    feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
    target_column = 'Outcome'
    xgtrain = xgb.DMatrix(train[feature_columns].values, train[target_column].values)
    xgtest = xgb.DMatrix(test[feature_columns].values, test[target_column].values)

    # Hyper-parameters (subsample is numeric, like colsample_bytree).
    param = {'max_depth': 5, 'eta': 0.1, 'silent': 1, 'subsample': 0.7, 'colsample_bytree': 0.7, 'objective': 'binary:logistic'}

    # The watchlist names the datasets whose metric is reported while training.
    watchlist = [(xgtest, 'eval'), (xgtrain, 'train')]
    num_round = 10
    bst = xgb.train(param, xgtrain, num_round, evals=watchlist)

    # NOTE(review): ntree_limit is the legacy argument; recent XGBoost releases
    # use iteration_range=(0, n) instead - confirm against the installed version.
    # Predict using only the first tree.
    ypred1 = bst.predict(xgtest, ntree_limit=1)

    # Predict using only the first 9 trees.
    ypred2 = bst.predict(xgtest, ntree_limit=9)
    label = xgtest.get_label()
    print('用前1颗树预测的错误率为 %f' % (np.sum((ypred1 > 0.5) != label) / float(len(label))))
    print('用前9颗树预测的错误率为 %f' % (np.sum((ypred2 > 0.5) != label) / float(len(label))))
    
    8

    1.4 预估器建模方式:sklearn与xgboost配合使用

    1.4.1 Xgboost建模 sklearn评估

    import pickle 
    import xgboost as xgb
    
    import numpy as np
    from sklearn.model_selection  import KFold , train_test_split , GridSearchCV
    from sklearn.metrics import confusion_matrix , mean_squared_error
    from sklearn.datasets import load_iris , load_digits , load_boston
    
    rng = np.random.RandomState(31337)


    # Binary classification: confusion matrix.
    print('数字0和1的二分类问题')
    # n_class is passed by keyword: keep only the digit classes 0 and 1.
    digits = load_digits(n_class=2)
    y = digits['target']
    X = digits['data']

    # Splitter object for the folds.
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    print('在2折数据上的交叉验证')

    # 2-fold cross-validation.
    for train_index, test_index in kf.split(X):
        xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
        predictions = xgb_model.predict(X[test_index])
        actuals = y[test_index]
        print('混淆矩阵:')
        print(confusion_matrix(actuals, predictions))

    # Multi-class classification: confusion matrix.
    print('\nIris:多分类')
    # The iris dataset has to be loaded before it is indexed.
    iris = load_iris()
    y = iris['target']
    X = iris['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    print('在2折数据上交叉验证')
    for train_index, test_index in kf.split(X):
        xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
        predictions = xgb_model.predict(X[test_index])
        actuals = y[test_index]
        print('混淆矩阵')
        print(confusion_matrix(actuals, predictions))

    # Regression: mean squared error.
    print('\n波士顿房价回归预测问题')
    boston = load_boston()
    y = boston['target']
    # Upper-case X, so the folds below use the Boston features rather than
    # the iris matrix left over from the previous section.
    X = boston['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    print('在2折数据上的交叉验证')
    for train_index, test_index in kf.split(X):
        xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])
        predictions = xgb_model.predict(X[test_index])
        actuals = y[test_index]
        print("MSE:", mean_squared_error(actuals, predictions))
    
    
    9

    1.4.2 网格搜索交叉验证查找最优参数

    print('参数最优化:')
    y = boston['target']
    # Upper-case X: this is the matrix handed to clf.fit below.
    X = boston['data']
    xgb_model = xgb.XGBRegressor()
    # Exhaustive search over tree depth and number of trees.
    param_grid = {'max_depth': [2, 4, 6], 'n_estimators': [20, 100, 200]}
    clf = GridSearchCV(xgb_model, param_grid, verbose=1)

    clf.fit(X, y)
    print(clf.best_score_)
    print(clf.best_params_)
    
    10

    1.4.3 early-stopping 早停

    # Learn the model on the training split, adding trees one at a time while
    # watching the validation split; once the validation metric stops
    # improving, stop adding and growing trees.

    X = digits['data']
    y = digits['target']

    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)
    clf = xgb.XGBClassifier()
    # The keyword is 'early_stopping_rounds' (plural).
    # NOTE(review): newer XGBoost releases expect early_stopping_rounds and
    # eval_metric on the estimator constructor instead of fit() - confirm
    # against the installed version.
    clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric='auc', eval_set=[(X_val, y_val)])
    
    11
    12

    1.4.4 特征重要度

    iris = load_iris()
    y = iris['traget']
    x = iris['data']
    xgb_model = xgb.XGBClassifier().fit(X , y)
    
    print('特征排序')
    feature_names = ['sepal_length' , 'sepal_width' , 'petal_length' ,'petal_width']
    feature_importances = xgb_model.feature_importances_
    indices = np.argsort(feature_importances)[: :-1]
    
    for index in indices:
        print("特征 %s 重要度为 %f" %(feature_names[index], feature_importances[index]))
    
    %matplotlib inline
    import matplotlib.pyplot as plt
    plt.figure(figsize=(16,8))
    plt.title("feature importances")
    plt.bar(range(len(feature_importances)), feature_importances[indices], color='b')
    plt.xticks(range(len(feature_importances)), np.array(feature_names)[indices], color='b')
    
    13 14

    1.4.5 并行加速

    import os
    
    if __name__ == "__main__":
        # The example requires multiprocessing.set_start_method to be importable.
        try:
            from multiprocessing import set_start_method
        except ImportError:
            raise ImportError(
                "Unable to import multiprocessing.set_start_method."
                " This example only runs on Python 3.4"
            )
        # set_start_method("forkserver")  # left disabled, as in the original example

        import numpy as np
        from sklearn.model_selection import GridSearchCV
        from sklearn.datasets import load_boston
        import xgboost as xgb

        rng = np.random.RandomState(31337)

        print("Parallel Parameter optimization")
        boston = load_boston()

        # Set the OpenMP thread-count environment variable (any value may be used).
        os.environ["OMP_NUM_THREADS"] = "2"
        y = boston['target']
        X = boston['data']
        xgb_model = xgb.XGBRegressor()
        # Search over tree depth and number of trees with two parallel jobs.
        clf = GridSearchCV(
            xgb_model,
            {'max_depth': [2, 4, 6], 'n_estimators': [50, 100, 200]},
            verbose=1,
            n_jobs=2,
        )
        clf.fit(X, y)
        print(clf.best_score_)
        print(clf.best_params_)
    
    15

    相关文章

      网友评论

          本文标题:1.xgboost_cheatsheet

          本文链接:https://www.haomeiwen.com/subject/lndszktx.html