美文网首页
三大树模型

三大树模型

作者: 一只当归 | 来源:发表于2020-08-17 21:32 被阅读0次

lgb1

# --- LightGBM model 1: native lgb.train API with month-grouped CV (DT_M) ---
import joblib  # BUG FIX: `sklearn.externals.joblib` was removed in scikit-learn 0.23
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
import gc

# Out-of-fold predictions for the train set, fold-averaged predictions for test.
# NOTE(review): assumes X_train / X_test / Y_train / cols are defined earlier in the file.
oof = np.zeros(len(X_train))
preds = np.zeros(len(X_test))

params = {
    'objective': 'binary',
    'learning_rate': 0.007,
    'boosting_type': 'gbdt',
    'num_leaves': 256,
    'tree_learner': 'serial',
    'subsample': 0.7,
    'subsample_freq': 1,
    'metric': 'auc',
    'max_bin': 255,
    'colsample_bytree': 0.5,  # 0.4
    # 'categorical_feature': cat_cols
}

# GroupKFold on the month column: each fold withholds one whole month, so
# validation rows never share a month with training rows (time-leakage guard).
skf = GroupKFold(n_splits=6)
for i, (idxT, idxV) in enumerate(skf.split(X_train, Y_train, groups=X_train['DT_M'])):
    month = X_train.iloc[idxV]['DT_M'].iloc[0]
    print('Fold', i, 'withholding month', month)
    print(' rows of train =', len(idxT), 'rows of holdout =', len(idxV))

    train_data = lgb.Dataset(X_train[cols].iloc[idxT], label=Y_train.iloc[idxT])
    val_data = lgb.Dataset(X_train[cols].iloc[idxV], label=Y_train.iloc[idxV])

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` kwargs were removed
    # from lgb.train in LightGBM 4.x (use callbacks) — confirm installed version.
    clf = lgb.train(params, train_set=train_data, num_boost_round=50000,
                    valid_sets=[train_data, val_data], valid_names=['train', 'valid'],
                    early_stopping_rounds=100, feval=None, verbose_eval=200)

    # Each row falls in exactly one validation fold, so plain assignment
    # is equivalent to += into the zero-initialized array.
    oof[idxV] = clf.predict(X_train[cols].iloc[idxV])
    preds += clf.predict(X_test[cols]) / skf.n_splits

# BUG FIX: the original scored `oof/5` even though every entry holds a single
# fold prediction (and n_splits is 6, not 5); AUC is scale-invariant so the
# number came out right by accident. The label also said 'XGB95' for a
# LightGBM model.
print('LGB1 OOF CV=', roc_auc_score(Y_train, oof))

lgb2

# --- LightGBM model 2: sklearn-API classifier with stratified CV ---
# BUG FIX: StratifiedKFold is used below but was never imported in this snippet.
from sklearn.model_selection import StratifiedKFold

lgb_model = lgb.LGBMClassifier(
    boosting_type='gbdt', num_leaves=100, reg_alpha=3, reg_lambda=5, max_depth=-1,
    n_estimators=5000, objective='binary', subsample=0.9, colsample_bytree=0.77,
    subsample_freq=1, learning_rate=0.05, random_state=1000, n_jobs=16,
    min_child_weight=4, min_child_samples=5, min_split_gain=0,
    class_weight={0: 1, 1: 2.5})  # up-weight the positive class

skf = StratifiedKFold(n_splits=5, random_state=2018, shuffle=True)

best_score = []                          # per-fold best validation logloss
oof_preds = np.zeros(train.shape[0])     # out-of-fold probabilities on train
sub_preds = np.zeros(test_id.shape[0])   # fold-averaged probabilities on test

# NOTE(review): assumes train / label / test / test_id are defined earlier in the file.
for index, (train_index, test_index) in enumerate(skf.split(train, label)):
    lgb_model.fit(train.iloc[train_index], label.iloc[train_index], verbose=50,
                  eval_set=[(train.iloc[train_index], label.iloc[train_index]),
                            (train.iloc[test_index], label.iloc[test_index])],
                  early_stopping_rounds=30)

    # 'valid_1' is the second eval_set entry, i.e. the held-out fold.
    best_score.append(lgb_model.best_score_['valid_1']['binary_logloss'])
    print(best_score)

    # Predict the held-out fold at the early-stopped iteration count.
    oof_preds[test_index] = lgb_model.predict_proba(
        train.iloc[test_index], num_iteration=lgb_model.best_iteration_)[:, 1]

    test_pred = lgb_model.predict_proba(test, num_iteration=lgb_model.best_iteration_)[:, 1]
    # BUG FIX: the fold average was hard-coded as `/ 5`; tying it to
    # skf.n_splits keeps the average correct if n_splits changes.
    sub_preds += test_pred / skf.n_splits

# tpr_weight_funtion (sic — defined elsewhere in the project) scores the
# OOF predictions; presumably returns an indexable score tuple — verify.
m = tpr_weight_funtion(y_predict=oof_preds, y_true=label)
print(m[1])

sub = pd.read_csv('submit.csv')
sub['Tag'] = sub_preds
sub.to_csv('sub/baseline_%s.csv' % str(m), index=False)

随机森林

# --- Random forest baseline with the same month-grouped CV as the LightGBM run ---
from sklearn.ensemble import RandomForestClassifier


import joblib  # BUG FIX: `sklearn.externals.joblib` was removed in scikit-learn 0.23
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
import gc

# Shallow forest; positive class up-weighted 10x for the imbalanced target.
clf = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=8,
                             class_weight={0: 1, 1: 10})
# NOTE(review): assumes X_train / X_test / Y_train / cols are defined earlier in the file.
oof = np.zeros(len(X_train))
preds = np.zeros(len(X_test))

skf = GroupKFold(n_splits=6)
for i, (idxT, idxV) in enumerate(skf.split(X_train, Y_train, groups=X_train['DT_M'])):
    month = X_train.iloc[idxV]['DT_M'].iloc[0]
    print('Fold', i, 'withholding month', month)
    print(' rows of train =', len(idxT), 'rows of holdout =', len(idxV))

    # Re-fitting the same estimator each fold is safe: sklearn's fit() resets state.
    clf.fit(X_train[cols].iloc[idxT], Y_train.iloc[idxT])
    oof[idxV] += clf.predict_proba(X_train[cols].iloc[idxV])[:, 1]
    preds += clf.predict_proba(X_test[cols])[:, 1] / skf.n_splits

xgboost

# XGBoost variant of the month-grouped CV above; only runs when BUILD96 is set.
if BUILD96:
    oof = np.zeros(len(X_train))
    preds = np.zeros(len(X_test))

    skf = GroupKFold(n_splits=6)
    fold_splits = skf.split(X_train, y_train, groups=X_train['DT_M'])
    for fold, (fit_idx, val_idx) in enumerate(fold_splits):
        held_month = X_train.iloc[val_idx]['DT_M'].iloc[0]
        print('Fold', fold, 'withholding month', held_month)
        print(' rows of train =', len(fit_idx), 'rows of holdout =', len(val_idx))

        # Fresh estimator every fold so no boosted trees carry over.
        clf = xgb.XGBClassifier(
            n_estimators=5000,
            max_depth=12,
            learning_rate=0.02,
            subsample=0.8,
            colsample_bytree=0.4,
            missing=-1,
            eval_metric='auc',
            # USE CPU
            # nthread=4,
            # tree_method='hist'
            # USE GPU
            tree_method='gpu_hist'
        )
        h = clf.fit(
            X_train[cols].iloc[fit_idx], y_train.iloc[fit_idx],
            eval_set=[(X_train[cols].iloc[val_idx], y_train.iloc[val_idx])],
            verbose=100, early_stopping_rounds=200)

        # Accumulate held-out-fold probabilities and the fold-averaged test preds.
        oof[val_idx] += clf.predict_proba(X_train[cols].iloc[val_idx])[:, 1]
        preds += clf.predict_proba(X_test[cols])[:, 1] / skf.n_splits

        # Drop the fitted model before the next fold to keep memory flat.
        del h, clf
        gc.collect()

    print('#' * 20)
    print('XGB96 OOF CV=', roc_auc_score(y_train, oof))

相关文章

网友评论

      本文标题:三大树模型

      本文链接:https://www.haomeiwen.com/subject/uercjktx.html