美文网首页
lightGBM二分类加权总结

lightGBM二分类加权总结

作者: FreeTheWorld | 来源:发表于2021-04-26 20:57 被阅读0次

    源码参考:

    https://github.com/microsoft/LightGBM/blob/dc6995742a5284a1e942978e2542fc49adda9ea1/src/objective/binary_objective.hpp

    label_weight

    有两个超参数会影响label_weight,分别是scale_pos_weight和is_unbalance。
    对于二分类,正、负样本的label_weight默认比例是1:1。设置scale_pos_weight后,正、负样本的label_weight比例变为scale_pos_weight:1。如果未设置scale_pos_weight但设置了is_unbalance=true,则正、负样本的label_weight比例为两类样本数的反比:样本数较少的一类权重为(多数类样本数/少数类样本数),样本数较多的一类权重保持为1。
    具体的参考代码:

    // Prepares the binary objective for training: caches the label and
    // per-sample weight arrays, counts positive and negative samples, and
    // derives the per-class label weights used later by GetGradients.
    //
    // metadata: source of the label array (label()) and the per-sample weight
    //           array (weights()); GetGradients treats a null weights pointer
    //           as "no sample weights".
    // num_data: number of samples; used as the loop bound over label_.
    void Init(const Metadata& metadata, data_size_t num_data) override {
        num_data_ = num_data;
        label_ = metadata.label();
        weights_ = metadata.weights();
        data_size_t cnt_positive = 0;
        data_size_t cnt_negative = 0;
        // count for positive and negative samples
        // (the reduction clause gives each thread private counters that are
        // summed into cnt_positive / cnt_negative when the loop finishes)
        #pragma omp parallel for schedule(static) reduction(+:cnt_positive, cnt_negative)
        for (data_size_t i = 0; i < num_data_; ++i) {
          if (is_pos_(label_[i])) {
            ++cnt_positive;
          } else {
            ++cnt_negative;
          }
        }
        if (cnt_negative == 0 || cnt_positive == 0) {
          Log::Warning("Contains only one class");
          // not need to boost.
          // With num_data_ set to 0, the per-sample loops in GetGradients
          // execute zero iterations.
          num_data_ = 0;
        }
        // The counts are logged in every case, including the single-class one.
        Log::Info("Number of positive: %d, number of negative: %d", cnt_positive, cnt_negative);
        // use -1 for negative class, and 1 for positive class
        // (index 0 = negative, index 1 = positive; GetGradients indexes these
        // arrays with the result of is_pos_)
        label_val_[0] = -1;
        label_val_[1] = 1;
        // weight for label
        // Default: both classes weighted equally (1:1).
        label_weights_[0] = 1.0f;
        label_weights_[1] = 1.0f;
        // if using unbalance, change the labels weight
        // The minority class gets weight (majority count / minority count) and
        // the majority class keeps weight 1. The "> 0" checks keep both
        // divisions below away from a zero denominator.
        if (is_unbalance_ && cnt_positive > 0 && cnt_negative > 0) {
          if (cnt_positive > cnt_negative) {
            label_weights_[1] = 1.0f;
            // cast so the ratio is computed in floating point
            label_weights_[0] = static_cast<double>(cnt_positive) / cnt_negative;
          } else {
            label_weights_[1] = static_cast<double>(cnt_negative) / cnt_positive;
            label_weights_[0] = 1.0f;
          }
        }
        // scale_pos_weight_ multiplies the positive-class weight. In this
        // function the multiplication is unconditional, i.e. it is applied on
        // top of whatever value the is_unbalance_ branch produced.
        label_weights_[1] *= scale_pos_weight_;
      }
    

    sample_weight

    lgb还支持设置样本权重sample_weight。一旦设置了样本权重,它会和label_weight相乘,共同影响最终的gradients和hessians。sample_weight可以为0:此时该样本的gradient和hessian都为0,相当于该样本不参与模型训练。
    具体的参考代码:

    // Computes, for each of the num_data_ samples, the gradient and hessian of
    // the binary objective with respect to that sample's score.
    //
    // score:     input array, one raw score per sample (read at index i).
    // gradients: output array; gradients[i] is written for every i < num_data_.
    // hessians:  output array; hessians[i] is written for every i < num_data_.
    //
    // With label in {-1, +1}, `response` below equals the first derivative of
    // log(1 + exp(-label * sigmoid_ * score)) with respect to score, and
    // abs_response * (sigmoid_ - abs_response) equals its second derivative
    // (assuming sigmoid_ > 0). Both are then scaled by the per-class label
    // weight and, when present, by the per-sample weight.
    void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
        // Two near-identical loops: the first is used when no per-sample
        // weights are present, the second additionally multiplies by
        // weights_[i].
        if (weights_ == nullptr) {
          // Each iteration reads and writes only index i, so iterations are
          // independent of one another.
          #pragma omp parallel for schedule(static)
          for (data_size_t i = 0; i < num_data_; ++i) {
            // get label and label weights
            // is_pos (0 or 1) selects the signed label (-1 / +1) and the class weight
            const int is_pos = is_pos_(label_[i]);
            const int label = label_val_[is_pos];
            const double label_weight = label_weights_[is_pos];
            // calculate gradients and hessians
            const double response = -label * sigmoid_ / (1.0f + std::exp(label * sigmoid_ * score[i]));
            const double abs_response = fabs(response);
            gradients[i] = static_cast<score_t>(response * label_weight);
            hessians[i] = static_cast<score_t>(abs_response * (sigmoid_ - abs_response) * label_weight);
          }
        } else {
          #pragma omp parallel for schedule(static)
          for (data_size_t i = 0; i < num_data_; ++i) {
            // get label and label weights
            const int is_pos = is_pos_(label_[i]);
            const int label = label_val_[is_pos];
            const double label_weight = label_weights_[is_pos];
            // calculate gradients and hessians
            const double response = -label * sigmoid_ / (1.0f + std::exp(label * sigmoid_ * score[i]));
            const double abs_response = fabs(response);
            // Same formulas as the unweighted branch, with an extra factor of
            // weights_[i]; a weight of 0 therefore yields a zero gradient and
            // a zero hessian for that sample.
            gradients[i] = static_cast<score_t>(response * label_weight  * weights_[i]);
            hessians[i] = static_cast<score_t>(abs_response * (sigmoid_ - abs_response) * label_weight * weights_[i]);
          }
        }
      }
    

    (待续~)

    相关文章

      网友评论

          本文标题:lightGBM二分类加权总结

          本文链接:https://www.haomeiwen.com/subject/ysfyrltx.html