美文网首页我爱编程
ROIPooling代码理解(CPU)

ROIPooling代码理解(CPU)

作者: 魔法少女玛格姬 | 来源:发表于2018-05-23 11:55 被阅读0次

    MXNet中ROIPooling的具体实现。
    代码来自https://github.com/apache/incubator-mxnet
    包括前向传播的c++实现

    void ROIPoolForward(out,in,bbox,max_idx,spatial_scale){...}
    

    反向传播的c++实现

    void ROIPoolBackwardAcc(in_grad,out_grad,bbox,max_idx,spatial_scale){...}
    

    以上操作封装成Operator,并在MXNet里注册。
    注册的算子名为ROIPooling,对应的属性类为ROIPoolingProp,由它通过CreateOperatorEx创建具体的算子实例(前向和反向都在该算子中完成)。

    namespace mxnet {
    namespace op {
    
    /*!
     * \brief CPU specialization of the operator factory for ROI pooling.
     *
     * Allocates a ROIPoolingOp<cpu, DType> with `new` and hands the raw pointer to the
     * caller; nothing in this function releases it.
     * NOTE(review): MSHADOW_REAL_TYPE_SWITCH is defined elsewhere; presumably it binds
     * DType to the real type selected by `dtype` before running the braced body - confirm.
     *
     * \param param operator parameters forwarded to the ROIPoolingOp constructor
     * \param dtype type code used to choose DType
     * \return the newly allocated operator, or a null pointer if the body never ran
     */
    template<>
    Operator *CreateOp<cpu>(ROIPoolingParam param, int dtype) {
      Operator *created = nullptr;
      MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
        created = new ROIPoolingOp<cpu, DType>(param);
      });
      return created;
    }
    
    // Creates the operator for this property by passing the stored parameters (param_)
    // and the type code of the first input (in_type->at(0)) to CreateOp.
    // NOTE(review): DO_BIND_DISPATCH is a macro defined elsewhere; it presumably selects
    // the CreateOp specialization matching `ctx` and returns its result (this body has no
    // explicit return statement) - confirm against the macro definition.
    // in_shape is not referenced directly in this body.
    Operator *ROIPoolingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
                                               std::vector<int> *in_type) const {
      DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
    }
    
    // Registers the ROIPoolingParam parameter struct.
    // NOTE(review): the macro is defined elsewhere (dmlc); exact effect not visible here.
    DMLC_REGISTER_PARAMETER(ROIPoolingParam);
    
    // Registers the operator name ROIPooling together with its property class
    // ROIPoolingProp. The `...` stands for the description text that this excerpt elides;
    // the full call appears in the complete listing.
    MXNET_REGISTER_OP_PROPERTY(ROIPooling, ROIPoolingProp).describe(...)
    

    代码中还给了具体的例子:

    //4-d tensor input, shape=(batch,channel,h,w)
      x = [[[[  0.,   1.,   2.,   3.,   4.,   5.],
             [  6.,   7.,   8.,   9.,  10.,  11.],
             [ 12.,  13.,  14.,  15.,  16.,  17.],
             [ 18.,  19.,  20.,  21.,  22.,  23.],
             [ 24.,  25.,  26.,  27.,  28.,  29.],
             [ 30.,  31.,  32.,  33.,  34.,  35.],
             [ 36.,  37.,  38.,  39.,  40.,  41.],
             [ 42.,  43.,  44.,  45.,  46.,  47.]]]]
    
    //2-d tensor bounding box, shape=(num_roi, coordinate)
     y = [[0,0,0,4,4]]
    
    //pooled_size
     (2,2)
    
    //ROIPooling的结果,缩放尺寸为1的情况下,bbox的坐标为[0,0,4,4],pooling到2*2的尺寸
      ROIPooling(x, y, (2,2), 1.0) = [[[[ 14.,  16.],
                                        [ 26.,  28.]]]]
    
    //ROIPooling的结果,缩放尺寸为0.7的情况下,bbox的坐标为[0,0,3,3],pooling到2*2的尺寸
      ROIPooling(x, y, (2,2), 0.7) = [[[[  7.,   9.],
                                        [ 19.,  21.]]]]
    

    完整代码和注释如下:

    #include "./roi_pooling-inl.h"
    #include <mshadow/base.h>
    #include <mshadow/tensor.h>
    #include <mshadow/packet-inl.h>
    #include <mshadow/dot_engine-inl.h>
    #include <cassert>
    
    using std::max;
    using std::min;
    using std::floor;
    using std::ceil;
    //ROIPooling前向部分
    namespace mshadow {
    /*!
     * \brief CPU forward pass of ROI max pooling.
     *
     * Each row of bbox is read as [batch_index, x1, y1, x2, y2]. The corner coordinates
     * are multiplied by spatial_scale_ and rounded, the resulting window is divided into
     * out.size(2) x out.size(3) bins, and for every channel the maximum of each bin is
     * written to out while the flat position (h * width + w) of that maximum is written
     * to max_idx.
     *
     * Every output cell is fully determined by this function: the running maximum of a
     * non-empty bin is seeded from the bin's first element, so the result does not depend
     * on what out / max_idx contained on entry. Empty bins yield 0 in out and -1 in
     * max_idx. Because the seed is a real input value, a NaN in the first position of a
     * bin is propagated to the output.
     *
     * \param out            pooled output, indexed as [roi, channel, pooled_h, pooled_w]
     * \param data           input feature map, indexed as [batch, channel, h, w]
     * \param bbox           ROI table, one row per ROI
     * \param max_idx        argmax positions, indexed like out; stored as Dtype
     * \param spatial_scale_ factor applied to the ROI coordinates before rounding
     *
     * NOTE(review): all addressing goes through raw dptr_ arithmetic built from size(),
     * which assumes the tensors are densely packed - confirm for padded tensors.
     */
    template<typename Dtype>
    inline void ROIPoolForward(const Tensor<cpu, 4, Dtype> &out,
                               const Tensor<cpu, 4, Dtype> &data,
                               const Tensor<cpu, 2, Dtype> &bbox,
                               const Tensor<cpu, 4, Dtype> &max_idx,
                               const float spatial_scale_) {
      const Dtype *bottom_data = data.dptr_;
      const Dtype *bottom_rois = bbox.dptr_;
      Dtype *top_data = out.dptr_;
      Dtype *argmax_data = max_idx.dptr_;
      const int channels_ = data.size(1);
      const int height_ = data.size(2);
      const int width_ = data.size(3);
      const int pooled_height_ = out.size(2);
      const int pooled_width_ = out.size(3);
    
      const int num_rois = bbox.size(0);
      // Element counts of one image, one channel plane, and the matching output slices.
      const int data_size = data.size(1) * data.size(2) * data.size(3);
      const int data_size_c = data.size(2) * data.size(3);
      const int out_size_c = out.size(2) * out.size(3);
      const int out_size = channels_ * out_size_c;
      const int max_idx_size_c = max_idx.size(2) * max_idx.size(3);
      const int max_idx_size = channels_ * max_idx_size_c;
      // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
      for (int n = 0; n < num_rois; ++n) {
        // Row n of the ROI table and the output slices that belong to it.
        const Dtype *bottom_rois_n = bottom_rois + n * bbox.size(1);
        Dtype *top_data_n = top_data + n * out_size;
        Dtype *argmax_data_n = argmax_data + n * max_idx_size;
        int roi_batch_ind = bottom_rois_n[0];
        int roi_start_w = round(bottom_rois_n[1] * spatial_scale_);
        int roi_start_h = round(bottom_rois_n[2] * spatial_scale_);
        int roi_end_w = round(bottom_rois_n[3] * spatial_scale_);
        int roi_end_h = round(bottom_rois_n[4] * spatial_scale_);
        assert(roi_batch_ind >= 0);
        assert(static_cast<index_t>(roi_batch_ind) < data.size(0) /* batch size */);
    
        // Force malformed ROIs to be at least 1 x 1.
        int roi_height = max(roi_end_h - roi_start_h + 1, 1);
        int roi_width = max(roi_end_w - roi_start_w + 1, 1);
        // Fractional bin extent: the pooled size is fixed, so the bin size varies per ROI.
        const Dtype bin_size_h = static_cast<Dtype>(roi_height)
                                 / static_cast<Dtype>(pooled_height_);
        const Dtype bin_size_w = static_cast<Dtype>(roi_width)
                                 / static_cast<Dtype>(pooled_width_);
        // Start of the image (batch entry) this ROI refers to.
        const Dtype* batch_data = bottom_data + data_size * roi_batch_ind;
    
        #pragma omp parallel for
        for (int c = 0; c < channels_; ++c) {
          // Channel c of the source image and of this ROI's output / argmax slices.
          const Dtype* batch_data_c = batch_data + c * data_size_c;
          Dtype* top_data_c = top_data_n + c * out_size_c;
          Dtype* argmax_data_c = argmax_data_n + c * max_idx_size_c;
    
          for (int ph = 0; ph < pooled_height_; ++ph) {
            for (int pw = 0; pw < pooled_width_; ++pw) {
              // Compute pooling region for this output unit:
              //  start (included) = floor(ph * roi_height / pooled_height_)
              //  end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
              int hstart = static_cast<int>(floor(static_cast<Dtype>(ph)
                                                  * bin_size_h));
              int wstart = static_cast<int>(floor(static_cast<Dtype>(pw)
                                                  * bin_size_w));
              int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1)
                                               * bin_size_h));
              int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1)
                                               * bin_size_w));
    
              // Shift into image coordinates and clip to the feature map.
              hstart = min(max(hstart + roi_start_h, 0), height_);
              hend = min(max(hend + roi_start_h, 0), height_);
              wstart = min(max(wstart + roi_start_w, 0), width_);
              wend = min(max(wend + roi_start_w, 0), width_);
    
              const int pool_index = ph * pooled_width_ + pw;
              if ((hend <= hstart) || (wend <= wstart)) {
                // Nothing of the bin is left inside the feature map.
                top_data_c[pool_index] = 0;
                argmax_data_c[pool_index] = -1;
                continue;
              }
    
              // Seed the running maximum from the first element of the bin instead of
              // reading the (possibly stale) contents of the output buffer.
              int best_index = hstart * width_ + wstart;
              Dtype best_value = batch_data_c[best_index];
              for (int h = hstart; h < hend; ++h) {
                for (int w = wstart; w < wend; ++w) {
                  const int index = h * width_ + w;
                  if (batch_data_c[index] > best_value) {
                    best_value = batch_data_c[index];
                    best_index = index;
                  }
                }
              }
              top_data_c[pool_index] = best_value;
              // Stored as Dtype; the backward pass casts it back to int.
              argmax_data_c[pool_index] = best_index;
            }
          }
        }
      }
      return;
    }
    /*!
     * \brief CPU backward pass of ROI max pooling, accumulating into in_grad.
     *
     * For every input position (b, c, h, w) the function walks over all ROIs that refer
     * to image b and contain (h, w), recomputes which pooled cells could have covered
     * that position, and sums out_grad of those cells whose recorded argmax equals
     * h * width + w. The sum is added to (not assigned to) in_grad, so the caller is
     * responsible for the initial contents of in_grad.
     *
     * \param in_grad        gradient w.r.t. the input, indexed as [batch, channel, h, w]
     * \param out_grad       gradient w.r.t. the pooled output,
     *                       indexed as [roi, channel, pooled_h, pooled_w]
     * \param bbox           ROI table; each row is read as [batch_index, x1, y1, x2, y2]
     * \param max_idx        argmax positions written by the forward pass, indexed like out_grad
     * \param spatial_scale_ factor applied to the ROI coordinates before rounding
     *
     * NOTE(review): addressing uses raw dptr_ arithmetic built from size(), which assumes
     * densely packed tensors - confirm for padded tensors.
     */
    template<typename Dtype>
    inline void ROIPoolBackwardAcc(const Tensor<cpu, 4, Dtype> &in_grad,
                                   const Tensor<cpu, 4, Dtype> &out_grad,
                                   const Tensor<cpu, 2, Dtype> &bbox,
                                   const Tensor<cpu, 4, Dtype> &max_idx,
                                   const float spatial_scale_) {
      const Dtype *top_diff = out_grad.dptr_;
      const Dtype *bottom_rois = bbox.dptr_;
      Dtype *bottom_diff = in_grad.dptr_;
      const Dtype *argmax_data = max_idx.dptr_;
    
      const int batch_size_ = in_grad.size(0);
      const int channels_ = in_grad.size(1);
      const int height_ = in_grad.size(2);
      const int width_ = in_grad.size(3);
      const int pooled_height_ = out_grad.size(2);
      const int pooled_width_ = out_grad.size(3);
    
      const int num_rois = bbox.size(0);
      // Row stride of the ROI table, taken from the tensor (as the forward pass does)
      // rather than a hard-coded 5.
      const int roi_stride = bbox.size(1);
    
      for (int b = 0; b < batch_size_; ++b) {
        for (int c = 0; c < channels_; ++c) {
          for (int h = 0; h < height_; ++h) {
            for (int w = 0; w < width_; ++w) {
              // Flat position inside one channel plane; this is the value the forward
              // pass recorded in max_idx.
              const int pixel_index = h * width_ + w;
              const int offset_bottom_diff =
                  (b * channels_ + c) * height_ * width_ + pixel_index;
    
              Dtype gradient = 0;
              // Accumulate gradient over all ROIs that pooled this element
              for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
                const Dtype* offset_bottom_rois = bottom_rois + roi_n * roi_stride;
                int roi_batch_ind = offset_bottom_rois[0];
                assert(roi_batch_ind >= 0);
                assert(roi_batch_ind < batch_size_);
                if (b != roi_batch_ind) {
                  continue;
                }
    
                int roi_start_w = round(offset_bottom_rois[1] * spatial_scale_);
                int roi_start_h = round(offset_bottom_rois[2] * spatial_scale_);
                int roi_end_w = round(offset_bottom_rois[3] * spatial_scale_);
                int roi_end_h = round(offset_bottom_rois[4] * spatial_scale_);
    
                // Skip ROIs whose (inclusive) window does not contain (h, w).
                bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
                               h >= roi_start_h && h <= roi_end_h);
                if (!in_roi) {
                  continue;
                }
    
                // force malformed ROIs to be 1 * 1
                int roi_height = max(roi_end_h - roi_start_h + 1, 1);
                int roi_width = max(roi_end_w - roi_start_w + 1, 1);
                const Dtype bin_size_h = static_cast<Dtype>(roi_height)
                                         / static_cast<Dtype>(pooled_height_);
                const Dtype bin_size_w = static_cast<Dtype>(roi_width)
                                         / static_cast<Dtype>(pooled_width_);
    
                // compute pooled regions correspond to original (h, w) point
                int phstart = static_cast<int>(floor(static_cast<Dtype>(h - roi_start_h)
                                                     / bin_size_h));
                int pwstart = static_cast<int>(floor(static_cast<Dtype>(w - roi_start_w)
                                                     / bin_size_w));
                int phend = static_cast<int>(ceil(static_cast<Dtype>(h - roi_start_h + 1)
                                                  / bin_size_h));
                int pwend = static_cast<int>(ceil(static_cast<Dtype>(w - roi_start_w + 1)
                                                  / bin_size_w));
    
                // clip to boundaries of pooled region
                phstart = min(max(phstart, 0), pooled_height_);
                phend = min(max(phend, 0), pooled_height_);
                pwstart = min(max(pwstart, 0), pooled_width_);
                pwend = min(max(pwend, 0), pooled_width_);
    
                // accumulate over gradients in pooled regions
                int offset = (roi_n * channels_ + c) * pooled_height_ * pooled_width_;
                const Dtype* offset_top_diff = top_diff + offset;
                const Dtype* offset_argmax_data = argmax_data + offset;
                for (int ph = phstart; ph < phend; ++ph) {
                  for (int pw = pwstart; pw < pwend; ++pw) {
                    const int pooled_index = ph * pooled_width_ + pw;
                    // Only the cell whose maximum came from this exact position
                    // passes its gradient back.
                    if (static_cast<int>(offset_argmax_data[pooled_index]) == pixel_index) {
                      gradient += offset_top_diff[pooled_index];
                    }
                  }
                }
              }
              bottom_diff[offset_bottom_diff] += gradient;
            }
          }
        }
      }
    
      return;
    }
    }  // namespace mshadow
    
    namespace mxnet {
    namespace op {
    
    /*!
     * \brief CPU specialization of the operator factory for ROI pooling.
     *
     * Allocates a ROIPoolingOp<cpu, DType> with `new` and hands the raw pointer to the
     * caller; nothing in this function releases it.
     * NOTE(review): MSHADOW_REAL_TYPE_SWITCH is defined elsewhere; presumably it binds
     * DType to the real type selected by `dtype` before running the braced body - confirm.
     *
     * \param param operator parameters forwarded to the ROIPoolingOp constructor
     * \param dtype type code used to choose DType
     * \return the newly allocated operator, or a null pointer if the body never ran
     */
    template<>
    Operator *CreateOp<cpu>(ROIPoolingParam param, int dtype) {
      Operator *created = nullptr;
      MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
        created = new ROIPoolingOp<cpu, DType>(param);
      });
      return created;
    }
    
    // Creates the operator for this property by passing the stored parameters (param_)
    // and the type code of the first input (in_type->at(0)) to CreateOp.
    // NOTE(review): DO_BIND_DISPATCH is a macro defined elsewhere; it presumably selects
    // the CreateOp specialization matching `ctx` and returns its result (this body has no
    // explicit return statement) - confirm against the macro definition.
    // in_shape is not referenced directly in this body.
    Operator *ROIPoolingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
                                               std::vector<int> *in_type) const {
      DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
    }
    
    // Registers the ROIPoolingParam parameter struct.
    // NOTE(review): the macro is defined elsewhere (dmlc); exact effect not visible here.
    DMLC_REGISTER_PARAMETER(ROIPoolingParam);
    
    // Registers the operator name ROIPooling with its property class ROIPoolingProp and
    // attaches the user-facing description and argument list. The description text is a
    // raw string, so its line breaks and leading whitespace are part of the emitted text.
    MXNET_REGISTER_OP_PROPERTY(ROIPooling, ROIPoolingProp)
    .describe(R"code(Performs region of interest(ROI) pooling on the input array.
    ROI pooling is a variant of a max pooling layer, in which the output size is fixed and
    region of interest is a parameter. Its purpose is to perform max pooling on the inputs
    of non-uniform sizes to obtain fixed-size feature maps. ROI pooling is a neural-net
    layer mostly used in training a `Fast R-CNN` network for object detection.
    This operator takes a 4D feature map as an input array and region proposals as `rois`,
    then it pools over sub-regions of input and produces a fixed-sized output array
    regardless of the ROI size.
    To crop the feature map accordingly, you can resize the bounding box coordinates
    by changing the parameters `rois` and `spatial_scale`.
    The cropped feature maps are pooled by standard max pooling operation to a fixed size output
    indicated by a `pooled_size` parameter. batch_size will change to the number of region
    bounding boxes after `ROIPooling`.
    The size of each region of interest doesn't have to be perfectly divisible by
    the number of pooling sections(`pooled_size`).
    Example::
      x = [[[[  0.,   1.,   2.,   3.,   4.,   5.],
             [  6.,   7.,   8.,   9.,  10.,  11.],
             [ 12.,  13.,  14.,  15.,  16.,  17.],
             [ 18.,  19.,  20.,  21.,  22.,  23.],
             [ 24.,  25.,  26.,  27.,  28.,  29.],
             [ 30.,  31.,  32.,  33.,  34.,  35.],
             [ 36.,  37.,  38.,  39.,  40.,  41.],
             [ 42.,  43.,  44.,  45.,  46.,  47.]]]]
      // region of interest i.e. bounding box coordinates.
      y = [[0,0,0,4,4]]
      // returns array of shape (2,2) according to the given roi with max pooling.
      ROIPooling(x, y, (2,2), 1.0) = [[[[ 14.,  16.],
                                        [ 26.,  28.]]]]
      // region of interest is changed due to the change in `spatial_scale` parameter.
      ROIPooling(x, y, (2,2), 0.7) = [[[[  7.,   9.],
                                        [ 19.,  21.]]]]
    )code" ADD_FILELINE)
    .add_argument("data", "NDArray-or-Symbol", "The input array to the pooling operator, "
                                                " a 4D Feature maps ")
    .add_argument("rois", "NDArray-or-Symbol", "Bounding box coordinates, a 2D array of "
    "[[batch_index, x1, y1, x2, y2]], where (x1, y1) and (x2, y2) are top left and bottom right "
    "corners of designated region of interest. `batch_index` indicates the index of corresponding "
    "image in the input array")
    .add_arguments(ROIPoolingParam::__FIELDS__());
    }  // namespace op
    }  // namespace mxnet
    

    相关文章

      网友评论

        本文标题:ROIPooling代码理解(CPU)

        本文链接:https://www.haomeiwen.com/subject/tmzljftx.html