The concrete implementation of ROIPooling in MXNet.
The code comes from https://github.com/apache/incubator-mxnet
It contains the C++ implementation of the forward pass,
void ROIPoolForward(out, data, bbox, max_idx, spatial_scale) {...}
and the C++ implementation of the backward pass,
void ROIPoolBackwardAcc(in_grad, out_grad, bbox, max_idx, spatial_scale) {...}
These two routines are wrapped into an Operator and registered with MXNet.
ROIPooling is the name under which the operator is registered; ROIPoolingProp is the operator property class that describes it to the framework and creates the actual ROIPoolingOp (a short usage sketch from the Python frontend follows the registration snippet below).
namespace mxnet {
namespace op {
template<>
Operator *CreateOp<cpu>(ROIPoolingParam param, int dtype) {
Operator* op = NULL;
MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
op = new ROIPoolingOp<cpu, DType>(param);
});
return op;
}
Operator *ROIPoolingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
std::vector<int> *in_type) const {
DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
}
DMLC_REGISTER_PARAMETER(ROIPoolingParam);
MXNET_REGISTER_OP_PROPERTY(ROIPooling, ROIPoolingProp).describe(...)
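Once registered, the operator is exposed to the Python frontend under the name passed to MXNET_REGISTER_OP_PROPERTY. A minimal usage sketch, assuming the classic NDArray API (mx.nd.ROIPooling, with the data/rois inputs and the pooled_size/spatial_scale parameters that the registration above declares); it reproduces the example given next:

import mxnet as mx

# 4-D feature map, shape (batch, channel, height, width)
data = mx.nd.arange(48).reshape((1, 1, 8, 6))
# one ROI per row: [batch_index, x1, y1, x2, y2]
rois = mx.nd.array([[0, 0, 0, 4, 4]])
# pool every ROI to a fixed 2x2 output
out = mx.nd.ROIPooling(data=data, rois=rois, pooled_size=(2, 2), spatial_scale=1.0)
print(out.asnumpy())   # expected: [[[[14. 16.] [26. 28.]]]]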
The code also comes with a concrete example:
//4-d tensor input, shape=(batch,channel,h,w)
x = [[[[ 0., 1., 2., 3., 4., 5.],
[ 6., 7., 8., 9., 10., 11.],
[ 12., 13., 14., 15., 16., 17.],
[ 18., 19., 20., 21., 22., 23.],
[ 24., 25., 26., 27., 28., 29.],
[ 30., 31., 32., 33., 34., 35.],
[ 36., 37., 38., 39., 40., 41.],
[ 42., 43., 44., 45., 46., 47.]]]]
//2-d tensor of bounding boxes, shape=(num_rois, 5); each row is [batch_index, x1, y1, x2, y2]
y = [[0,0,0,4,4]]
//pooled_size
(2,2)
//ROIPooling result with spatial_scale = 1.0: the bbox stays at [0,0,4,4] on the feature map and is pooled to 2*2
ROIPooling(x, y, (2,2), 1.0) = [[[[ 14., 16.],
[ 26., 28.]]]]
//ROIPooling result with spatial_scale = 0.7: the bbox coordinates are rescaled and rounded to [0,0,3,3] before pooling to 2*2
ROIPooling(x, y, (2,2), 0.7) = [[[[ 7., 9.],
[ 19., 21.]]]]
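These numbers follow directly from the floor/ceil bin arithmetic in ROIPoolForward below. As a sanity check, here is a small standalone NumPy sketch of just that arithmetic (an illustrative re-implementation, not the MXNet code itself; the helper name roi_pool_single is made up) that reproduces both outputs:

import numpy as np

def roi_pool_single(feat, roi, pooled_size, spatial_scale):
    # Max-pool one ROI from a single-channel feature map of shape (H, W),
    # mirroring the bin arithmetic of ROIPoolForward: scale and round the ROI,
    # force it to be at least 1x1, split it into pooled_size bins with
    # floor/ceil boundaries, and take the max inside each bin.
    H, W = feat.shape
    ph_num, pw_num = pooled_size
    _, x1, y1, x2, y2 = roi
    ws, hs = int(round(x1 * spatial_scale)), int(round(y1 * spatial_scale))
    we, he = int(round(x2 * spatial_scale)), int(round(y2 * spatial_scale))
    roi_h, roi_w = max(he - hs + 1, 1), max(we - ws + 1, 1)
    bin_h, bin_w = roi_h / ph_num, roi_w / pw_num
    out = np.zeros((ph_num, pw_num), dtype=feat.dtype)
    for ph in range(ph_num):
        for pw in range(pw_num):
            h0 = min(max(int(np.floor(ph * bin_h)) + hs, 0), H)
            h1 = min(max(int(np.ceil((ph + 1) * bin_h)) + hs, 0), H)
            w0 = min(max(int(np.floor(pw * bin_w)) + ws, 0), W)
            w1 = min(max(int(np.ceil((pw + 1) * bin_w)) + ws, 0), W)
            if h1 > h0 and w1 > w0:  # non-empty bin
                out[ph, pw] = feat[h0:h1, w0:w1].max()
    return out

feat = np.arange(48, dtype=np.float32).reshape(8, 6)
roi = [0, 0, 0, 4, 4]                           # [batch_index, x1, y1, x2, y2]
print(roi_pool_single(feat, roi, (2, 2), 1.0))  # [[14. 16.] [26. 28.]]
print(roi_pool_single(feat, roi, (2, 2), 0.7))  # [[ 7.  9.] [19. 21.]]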
The full code with comments follows:
#include "./roi_pooling-inl.h"
#include <mshadow/base.h>
#include <mshadow/tensor.h>
#include <mshadow/packet-inl.h>
#include <mshadow/dot_engine-inl.h>
#include <cassert>
using std::max;
using std::min;
using std::floor;
using std::ceil;
//ROIPooling forward pass
namespace mshadow {
template<typename Dtype>
inline void ROIPoolForward(const Tensor<cpu, 4, Dtype> &out,//[num_rois, channel, pooled_h, pooled_w]
const Tensor<cpu, 4, Dtype> &data,//[batch, channel, h, w]
const Tensor<cpu, 2, Dtype> &bbox,//[num_rois, 5]: (batch_index, x1, y1, x2, y2)
const Tensor<cpu, 4, Dtype> &max_idx,
const float spatial_scale_) {//scale from the original image to the feature map, e.g. 1.0, 0.75, 0.5
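// max_idx has the same shape as out; for every pooled output element it records
// the flat index (h * width_ + w) of the input element that produced the max,
// so the backward pass can route gradients back to exactly that element.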
const Dtype *bottom_data = data.dptr_;
const Dtype *bottom_rois = bbox.dptr_;
Dtype *top_data = out.dptr_;
Dtype *argmax_data = max_idx.dptr_;
const int channels_ = data.size(1);
const int height_ = data.size(2);
const int width_ = data.size(3);
const int pooled_height_ = out.size(2);
const int pooled_width_ = out.size(3);
const int num_rois = bbox.size(0);
const int data_size = data.size(1) * data.size(2) * data.size(3);
const int data_size_c = data.size(2) * data.size(3);
const int out_size_c = out.size(2) * out.size(3);
const int out_size = channels_ * out_size_c;
const int max_idx_size_c = max_idx.size(2) * max_idx.size(3);
const int max_idx_size = channels_ * max_idx_size_c;
// For each ROI R = [batch_index, x1, y1, x2, y2]: max pool over that region
for (int n = 0; n < num_rois; ++n) {
// locate the n-th ROI and its output / argmax slices
const Dtype *bottom_rois_n = bottom_rois + n * bbox.size(1);
Dtype *top_data_n = top_data + n * out_size;
Dtype *argmax_data_n = argmax_data + n * max_idx_size;
int roi_batch_ind = bottom_rois_n[0];
int roi_start_w = round(bottom_rois_n[1] * spatial_scale_);
int roi_start_h = round(bottom_rois_n[2] * spatial_scale_);
int roi_end_w = round(bottom_rois_n[3] * spatial_scale_);
int roi_end_h = round(bottom_rois_n[4] * spatial_scale_);
assert(roi_batch_ind >= 0);
assert(static_cast<index_t>(roi_batch_ind) < data.size(0) /* batch size */);
// force malformed ROIs to be at least 1*1
int roi_height = max(roi_end_h - roi_start_h + 1, 1);
int roi_width = max(roi_end_w - roi_start_w + 1, 1);
//compute the bin size: the ROI is divided into a fixed pooled_height_ * pooled_width_ grid
const Dtype bin_size_h = static_cast<Dtype>(roi_height)
/ static_cast<Dtype>(pooled_height_);
const Dtype bin_size_w = static_cast<Dtype>(roi_width)
/ static_cast<Dtype>(pooled_width_);
//locate the input image (batch element) this ROI belongs to
const Dtype* batch_data = bottom_data + data_size * roi_batch_ind;
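// each channel of this ROI is pooled independently, so the channel loop can run in parallel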
#pragma omp parallel for
for (int c = 0; c < channels_; ++c) {
// locate channel c of the input image, the output, and the argmax buffer for this ROI
const Dtype* batch_data_c = batch_data + c * data_size_c;
Dtype* top_data_c = top_data_n + c * out_size_c;
Dtype* argmax_data_c = argmax_data_n + c * max_idx_size_c;
// iterate over the pooled output positions
for (int ph = 0; ph < pooled_height_; ++ph) {
for (int pw = 0; pw < pooled_width_; ++pw) {
// start (included) = floor(ph * roi_height / pooled_height_)
// the top/left bin boundaries are rounded down
// end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
// the bottom/right bin boundaries are rounded up
int hstart = static_cast<int>(floor(static_cast<Dtype>(ph)
* bin_size_h));
int wstart = static_cast<int>(floor(static_cast<Dtype>(pw)
* bin_size_w));
int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1)
* bin_size_h));
int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1)
* bin_size_w));
hstart = min(max(hstart + roi_start_h, 0), height_);
hend = min(max(hend + roi_start_h, 0), height_);
wstart = min(max(wstart + roi_start_w, 0), width_);
wend = min(max(wend + roi_start_w, 0), width_);
bool is_empty = (hend <= hstart) || (wend <= wstart);
const int pool_index = ph * pooled_width_ + pw;
if (is_empty) {
top_data_c[pool_index] = 0;
argmax_data_c[pool_index] = -1;
}
//max pooling within this bin
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int index = h * width_ + w;
//keep the running maximum
if (batch_data_c[index] > top_data_c[pool_index]) {
top_data_c[pool_index] = batch_data_c[index];
//record the flat index of the current maximum within channel c for the backward pass
argmax_data_c[pool_index] = index;
}
}
}
}
}
}
}
return;
}
//ROIPooling backward pass
template<typename Dtype>
inline void ROIPoolBackwardAcc(const Tensor<cpu, 4, Dtype> &in_grad,
const Tensor<cpu, 4, Dtype> &out_grad,
const Tensor<cpu, 2, Dtype> &bbox,
const Tensor<cpu, 4, Dtype> &max_idx,
const float spatial_scale_) {
const Dtype *top_diff = out_grad.dptr_;
const Dtype *bottom_rois = bbox.dptr_;
Dtype *bottom_diff = in_grad.dptr_;
Dtype *argmax_data = max_idx.dptr_;
const int batch_size_ = in_grad.size(0);
const int channels_ = in_grad.size(1);
const int height_ = in_grad.size(2);
const int width_ = in_grad.size(3);
const int pooled_height_ = out_grad.size(2);
const int pooled_width_ = out_grad.size(3);
const int num_rois = bbox.size(0);
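// Instead of scattering gradient from each pooled output, the kernel walks over
// every input element (b, c, h, w) and gathers gradient from all ROIs whose
// argmax selected that element, which keeps the accumulation free of write conflicts.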
for (int b = 0; b < batch_size_; ++b) {
for (int c = 0; c < channels_; ++c) {
for (int h = 0; h < height_; ++h) {
for (int w = 0; w < width_; ++w) {
int offset_bottom_diff = (b * channels_ + c) * height_ * width_;
offset_bottom_diff += h * width_ + w;
Dtype gradient = 0;
// Accumulate gradient over all ROIs that pooled this element
for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
const Dtype* offset_bottom_rois = bottom_rois + roi_n * 5;
int roi_batch_ind = offset_bottom_rois[0];
assert(roi_batch_ind >= 0);
assert(roi_batch_ind < batch_size_);
if (b != roi_batch_ind) {
continue;
}
int roi_start_w = round(offset_bottom_rois[1] * spatial_scale_);
int roi_start_h = round(offset_bottom_rois[2] * spatial_scale_);
int roi_end_w = round(offset_bottom_rois[3] * spatial_scale_);
int roi_end_h = round(offset_bottom_rois[4] * spatial_scale_);
bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
h >= roi_start_h && h <= roi_end_h);
if (!in_roi) {
continue;
}
// force malformed ROIs to be 1 * 1
int roi_height = max(roi_end_h - roi_start_h + 1, 1);
int roi_width = max(roi_end_w - roi_start_w + 1, 1);
const Dtype bin_size_h = static_cast<Dtype>(roi_height)
/ static_cast<Dtype>(pooled_height_);
const Dtype bin_size_w = static_cast<Dtype>(roi_width)
/ static_cast<Dtype>(pooled_width_);
// compute the range of pooled bins that could have drawn from the original (h, w) point
int phstart = static_cast<int>(floor(static_cast<Dtype>(h - roi_start_h)
/ bin_size_h));
int pwstart = static_cast<int>(floor(static_cast<Dtype>(w - roi_start_w)
/ bin_size_w));
int phend = static_cast<int>(ceil(static_cast<Dtype>(h - roi_start_h + 1)
/ bin_size_h));
int pwend = static_cast<int>(ceil(static_cast<Dtype>(w - roi_start_w + 1)
/ bin_size_w));
// clip to boundaries of pooled region
phstart = min(max(phstart, 0), pooled_height_);
phend = min(max(phend, 0), pooled_height_);
pwstart = min(max(pwstart, 0), pooled_width_);
pwend = min(max(pwend, 0), pooled_width_);
// accumulate over gradients in pooled regions
int offset = (roi_n * channels_ + c) * pooled_height_ * pooled_width_;
const Dtype* offset_top_diff = top_diff + offset;
const Dtype* offset_argmax_data = argmax_data + offset;
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
const int pooled_index = ph * pooled_width_ + pw;
if (static_cast<int>(offset_argmax_data[pooled_index]) == h * width_ + w) {
gradient += offset_top_diff[pooled_index];
}
}
}
}
bottom_diff[offset_bottom_diff] += gradient;
}
}
}
}
return;
}
} // namespace mshadow
namespace mxnet {
namespace op {
template<>
Operator *CreateOp<cpu>(ROIPoolingParam param, int dtype) {
Operator* op = NULL;
MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
op = new ROIPoolingOp<cpu, DType>(param);
});
return op;
}
Operator *ROIPoolingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
std::vector<int> *in_type) const {
DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
}
DMLC_REGISTER_PARAMETER(ROIPoolingParam);
MXNET_REGISTER_OP_PROPERTY(ROIPooling, ROIPoolingProp)
.describe(R"code(Performs region of interest(ROI) pooling on the input array.
ROI pooling is a variant of a max pooling layer, in which the output size is fixed and
region of interest is a parameter. Its purpose is to perform max pooling on the inputs
of non-uniform sizes to obtain fixed-size feature maps. ROI pooling is a neural-net
layer mostly used in training a `Fast R-CNN` network for object detection.
This operator takes a 4D feature map as an input array and region proposals as `rois`,
then it pools over sub-regions of input and produces a fixed-sized output array
regardless of the ROI size.
To crop the feature map accordingly, you can resize the bounding box coordinates
by changing the parameters `rois` and `spatial_scale`.
The cropped feature maps are pooled by standard max pooling operation to a fixed size output
indicated by a `pooled_size` parameter. batch_size will change to the number of region
bounding boxes after `ROIPooling`.
The size of each region of interest doesn't have to be perfectly divisible by
the number of pooling sections(`pooled_size`).
Example::
x = [[[[ 0., 1., 2., 3., 4., 5.],
[ 6., 7., 8., 9., 10., 11.],
[ 12., 13., 14., 15., 16., 17.],
[ 18., 19., 20., 21., 22., 23.],
[ 24., 25., 26., 27., 28., 29.],
[ 30., 31., 32., 33., 34., 35.],
[ 36., 37., 38., 39., 40., 41.],
[ 42., 43., 44., 45., 46., 47.]]]]
// region of interest i.e. bounding box coordinates.
y = [[0,0,0,4,4]]
// returns array of shape (2,2) according to the given roi with max pooling.
ROIPooling(x, y, (2,2), 1.0) = [[[[ 14., 16.],
[ 26., 28.]]]]
// region of interest is changed due to the change in `spatial_scale` parameter.
ROIPooling(x, y, (2,2), 0.7) = [[[[ 7., 9.],
[ 19., 21.]]]]
)code" ADD_FILELINE)
.add_argument("data", "NDArray-or-Symbol", "The input array to the pooling operator, "
" a 4D Feature maps ")
.add_argument("rois", "NDArray-or-Symbol", "Bounding box coordinates, a 2D array of "
"[[batch_index, x1, y1, x2, y2]], where (x1, y1) and (x2, y2) are top left and bottom right "
"corners of designated region of interest. `batch_index` indicates the index of corresponding "
"image in the input array")
.add_arguments(ROIPoolingParam::__FIELDS__());
} // namespace op
} // namespace mxnet
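The gradient rule that ROIPoolBackwardAcc implements can be stated compactly: each pooled output cell sends its whole gradient to the one input element recorded in max_idx, and contributions from different ROIs (or from different bins that picked the same element) accumulate. A minimal NumPy sketch of that rule, written in scatter form for one ROI and one channel (the C++ kernel above performs the equivalent gather over input elements; the helper name is made up):

import numpy as np

def roi_pool_backward_single(out_grad, max_idx, in_shape):
    # Every pooled output cell passes its gradient to the single input element
    # whose flat index (h * W + w) was stored in max_idx during the forward
    # pass; -1 marks an empty bin.  Overlapping contributions accumulate (+=).
    H, W = in_shape
    in_grad = np.zeros(H * W, dtype=out_grad.dtype)
    for g, idx in zip(out_grad.ravel(), max_idx.ravel()):
        if idx >= 0:
            in_grad[int(idx)] += g
    return in_grad.reshape(H, W)

# With the spatial_scale=1.0 example above, the four pooled cells took their
# maxima at (2,2), (2,4), (4,2), (4,4), i.e. flat indices 14, 16, 26, 28.
out_grad = np.ones((2, 2), dtype=np.float32)
max_idx = np.array([[14., 16.], [26., 28.]], dtype=np.float32)
print(roi_pool_backward_single(out_grad, max_idx, (8, 6)))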