BatchNorm Caffe C++ Source Code Walkthrough

Author: 加油11dd23 | Published 2020-10-20 21:49

Analyzing how BatchNorm handles its dimensions.

I. The BatchNorm procedure, described in one sentence:

  1. For each channel within a batch, subtract the mean and divide by the standard deviation, so that the activations are mapped onto (approximately) a standard normal distribution [central limit theorem].
  2. Doing only this, however, would largely cancel the effect of the nonlinear activation, so a learnable scale and shift are applied afterwards, letting the distribution deviate slightly from the standard normal. (A minimal code sketch of these two steps follows this list.)
  • (Personally I feel BatchNorm is somewhat superfluous; wouldn't using a nonlinear activation directly be enough?)
  • (At its core BatchNorm is also tied to the i.i.d. assumption, i.e. that the training-set and test-set data come from the same distribution. Because the network's parameters keep changing during training, each layer's output distribution drifts away from its input distribution, which is why the distribution is re-normalized per batch. I am not entirely clear on this part yet and will fill it in when I update this post.)
  • If the distribution of the inputs X in an ML system's training pairs <X, Y> keeps changing, the i.i.d. assumption is violated: the model cannot learn a stable mapping, and we would have to bring in transfer learning just to cope with the shifting distribution.
    For a deep network with many hidden layers, the parameters of every layer change throughout training, so every hidden layer faces this covariate shift problem: its input distribution keeps drifting. This is the so-called "Internal Covariate Shift"; "internal" stresses that it happens inside the network, at the hidden layers, rather than the covariate shift occurring only at the input layer.
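
To make the two steps above concrete, here is a minimal standalone sketch of per-channel batch normalization over an NCHW tensor (plain C++ for illustration, not Caffe code; the function name, parameter names, and eps default are assumptions). Note that in Caffe the BatchNorm layer itself only performs the normalization; the learnable scale/shift is usually supplied by a separate Scale layer.

#include <cmath>
#include <vector>

// Naive BatchNorm over an NCHW tensor stored as a flat array.
// For each channel c: x_hat = (x - mean_c) / sqrt(var_c + eps), y = gamma_c * x_hat + beta_c.
void batch_norm_naive(std::vector<float>& x, int N, int C, int H, int W,
                      const std::vector<float>& gamma, const std::vector<float>& beta,
                      float eps = 1e-5f) {
  const int spatial = H * W;
  for (int c = 0; c < C; ++c) {
    // 1) mean and variance over every sample and spatial position of channel c
    double sum = 0.0, sq_sum = 0.0;
    for (int n = 0; n < N; ++n)
      for (int s = 0; s < spatial; ++s) {
        const double v = x[(n * C + c) * spatial + s];
        sum += v;
        sq_sum += v * v;
      }
    const double m = static_cast<double>(N) * spatial;
    const double mean = sum / m;
    const double var = sq_sum / m - mean * mean;  // E[X^2] - (E[X])^2
    const float inv_std = 1.0f / std::sqrt(static_cast<float>(var) + eps);
    // 2) normalize, then scale and shift
    for (int n = 0; n < N; ++n)
      for (int s = 0; s < spatial; ++s) {
        float& v = x[(n * C + c) * spatial + s];
        v = gamma[c] * ((v - static_cast<float>(mean)) * inv_std) + beta[c];
      }
  }
}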


    [Figure: BatchNorm flow (batchnorm流程.png)]
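
The flow figure is not reproduced here; the standard BatchNorm forward transform it presumably depicts (following the original paper) is

    \mu_B = \frac{1}{m}\sum_{i=1}^{m} x_i, \qquad
    \sigma_B^2 = \frac{1}{m}\sum_{i=1}^{m}(x_i - \mu_B)^2, \qquad
    \hat{x}_i = \frac{x_i - \mu_B}{\sqrt{\sigma_B^2 + \epsilon}}, \qquad
    y_i = \gamma\,\hat{x}_i + \beta.

At test time \mu_B and \sigma_B^2 are replaced by the moving averages accumulated during training; in the Caffe code below these are blobs_[0] and blobs_[1], each divided by the accumulation factor stored in blobs_[2].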

II. Annotated Caffe C++ source code


(1) batch_norm_layer.cpp

#include <algorithm>
#include <vector>

#include "caffe/layers/batch_norm_layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

template <typename Dtype>
void BatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  BatchNormParameter param = this->layer_param_.batch_norm_param(); // BatchNorm parameters
  moving_average_fraction_ = param.moving_average_fraction(); // moving-average momentum
  // During training, mean and variance are computed from the mini-batch.
  // During testing, mean and variance are the stored estimates over the whole dataset.
  use_global_stats_ = this->phase_ == TEST;
  if (param.has_use_global_stats())
    use_global_stats_ = param.use_global_stats();
  if (bottom[0]->num_axes() == 1)
    channels_ = 1;
  else
    channels_ = bottom[0]->shape(1);
  eps_ = param.eps();
  if (this->blobs_.size() > 0) {
    LOG(INFO) << "Skipping parameter initialization";
  } else {
    this->blobs_.resize(3); // the stored statistics: mean, variance, moving-average factor
    vector<int> sz;
    sz.push_back(channels_);
    this->blobs_[0].reset(new Blob<Dtype>(sz)); // running mean, an array of size channels_
    this->blobs_[1].reset(new Blob<Dtype>(sz)); // running variance, an array of size channels_
    sz[0] = 1;
    this->blobs_[2].reset(new Blob<Dtype>(sz)); // moving-average scale factor, an array of size 1
    for (int i = 0; i < 3; ++i) {
      caffe_set(this->blobs_[i]->count(), Dtype(0),
                this->blobs_[i]->mutable_cpu_data()); // initialize to 0
    }
  }
  // Mask statistics from optimization by setting local learning rates
  // for mean, variance, and the bias correction to zero.
  for (int i = 0; i < this->blobs_.size(); ++i) {
    if (this->layer_param_.param_size() == i) {
      ParamSpec* fixed_param_spec = this->layer_param_.add_param();
      fixed_param_spec->set_lr_mult(0.f);
    } else {
      CHECK_EQ(this->layer_param_.param(i).lr_mult(), 0.f)
          << "Cannot configure batch normalization statistics as layer "
          << "parameters.";
    }
  }
}

template <typename Dtype>
void BatchNormLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  // If bottom is 1-D, there is a single mean/variance; otherwise their count equals channels_.
  if (bottom[0]->num_axes() >= 1)
    CHECK_EQ(bottom[0]->shape(1), channels_);
  top[0]->ReshapeLike(*bottom[0]); // top[0] has the same shape as bottom[0]

  vector<int> sz;
  sz.push_back(channels_);
  mean_.Reshape(sz); // holds the per-channel mean
  variance_.Reshape(sz); // holds the per-channel variance
  temp_.ReshapeLike(*bottom[0]); // scratch blob: holds the squared deviations (x - mean_)^2
  x_norm_.ReshapeLike(*bottom[0]);
  sz[0] = bottom[0]->shape(0);
  batch_sum_multiplier_.Reshape(sz); // length = batch size

  // spatial dimension, height*width
  int spatial_dim = bottom[0]->count()/(channels_*bottom[0]->shape(0));

  if (spatial_sum_multiplier_.num_axes() == 0 ||
      spatial_sum_multiplier_.shape(0) != spatial_dim) {
    sz[0] = spatial_dim;
    spatial_sum_multiplier_.Reshape(sz);
    Dtype* multiplier_data = spatial_sum_multiplier_.mutable_cpu_data();
    // spatial_sum_multiplier_ is initialized to all ones; its size is height*width
    caffe_set(spatial_sum_multiplier_.count(), Dtype(1), multiplier_data);
  }

  int numbychans = channels_*bottom[0]->shape(0); // channels_ * batch size
  if (num_by_chans_.num_axes() == 0 ||
      num_by_chans_.shape(0) != numbychans) {
    sz[0] = numbychans;
    num_by_chans_.Reshape(sz);
    caffe_set(batch_sum_multiplier_.count(), Dtype(1),
        batch_sum_multiplier_.mutable_cpu_data()); // initialize to all ones
  }
}

// Forward pass: the means and variances are computed via matrix-vector products.
template <typename Dtype>
void BatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();
  Dtype* top_data = top[0]->mutable_cpu_data();
  int num = bottom[0]->shape(0); // batch size
  int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_); // height*width

  // Check whether the BN layer is running in place (input blob == output blob).
  if (bottom[0] != top[0]) {
    caffe_copy(bottom[0]->count(), bottom_data, top_data);
  }

  if (use_global_stats_) {
    // If use_global_stats_ is true, use the stored mean/variance estimates.
    // use the stored mean/variance estimates.
    const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ?
        0 : 1 / this->blobs_[2]->cpu_data()[0];
    caffe_cpu_scale(variance_.count(), scale_factor,
        this->blobs_[0]->cpu_data(), mean_.mutable_cpu_data()); // multiply by the scale factor
    caffe_cpu_scale(variance_.count(), scale_factor,
        this->blobs_[1]->cpu_data(), variance_.mutable_cpu_data());
  } else {
    // If use_global_stats_ is false:
    // compute mean
    // num_by_chans_ = (1. / (num * spatial_dim)) * bottom_data * spatial_sum_multiplier_
    // bottom_data is viewed as a matrix with channels_*num rows and spatial_dim columns,
    // so the result has channels_*num values, one per (sample, channel) pair.
    caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
        1. / (num * spatial_dim), bottom_data,
        spatial_sum_multiplier_.cpu_data(), 0.,
        num_by_chans_.mutable_cpu_data());

    // mean_ = 1 * num_by_chans_^T * batch_sum_multiplier_
    // num_by_chans_ is viewed as a num x channels_ matrix (transposed via CblasTrans),
    // so the values of each channel are summed over the batch, giving channels_ values.
    caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
        num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
        mean_.mutable_cpu_data());
  }

  // subtract mean
  // num_by_chans_ = 1 * batch_sum_multiplier_ * mean_  (broadcast mean_ over the batch)
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
      batch_sum_multiplier_.cpu_data(), mean_.cpu_data(), 0.,
      num_by_chans_.mutable_cpu_data());
  // top_data = -1 * num_by_chans_ * spatial_sum_multiplier_ + 1.0 * top_data
  // i.e. subtract the broadcast mean_ from every element of top_data
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
      spatial_dim, 1, -1, num_by_chans_.cpu_data(),
      spatial_sum_multiplier_.cpu_data(), 1., top_data);

  if (!use_global_stats_) {
    // If use_global_stats_ is false, compute the variance.
    // compute variance using var(X) = E((X-EX)^2)
    // Square every element of the mean-subtracted data; the result goes into temp_.
    caffe_sqr<Dtype>(top[0]->count(), top_data, temp_.mutable_cpu_data());  // (X-EX)^2
    // num_by_chans_ = (1. / (num * spatial_dim)) * temp_ * spatial_sum_multiplier_
    caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
        1. / (num * spatial_dim), temp_.cpu_data(),
        spatial_sum_multiplier_.cpu_data(), 0.,
        num_by_chans_.mutable_cpu_data()); // matrix-vector product
    // variance_ = 1.0 * num_by_chans_ * batch_sum_multiplier_
    caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
        num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
        variance_.mutable_cpu_data());  // E((X-EX)^2)

    // compute and save moving average
    // compute and save the moving averages
    // (step [F1] in the summary section)
    this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_;
    this->blobs_[2]->mutable_cpu_data()[0] += 1;

    // this->blobs_[0] = 1 * mean_ + moving_average_fraction_ * this->blobs_[0]
    // (step [F2] in the summary section)
    caffe_cpu_axpby(mean_.count(), Dtype(1), mean_.cpu_data(),
        moving_average_fraction_, this->blobs_[0]->mutable_cpu_data());
    // m = num * height * width 
    int m = bottom[0]->count()/channels_;

    Dtype bias_correction_factor = m > 1 ? Dtype(m)/(m-1) : 1;
    // this->blobs_[1] = bias_correction_factor * variance_ + moving_average_fraction_ * this->blobs_[1]
    // unbiased variance estimate: scale by m/(m-1)
    // (step [F3] in the summary section)
    caffe_cpu_axpby(variance_.count(), bias_correction_factor,
        variance_.cpu_data(), moving_average_fraction_,
        this->blobs_[1]->mutable_cpu_data());
  }

  // normalize variance
  // variance_ = variance_ + eps_  (add a small value for numerical stability)
  caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data());
  // take the element-wise square root, so variance_ now holds sqrt(var + eps)
  caffe_sqrt(variance_.count(), variance_.cpu_data(),
             variance_.mutable_cpu_data());

  // replicate variance to input size
  // The next two gemm calls broadcast the channels_ values of variance_ to the
  // full channels_ * num * height * width size.
  // num_by_chans_ = 1 * batch_sum_multiplier_ * variance_
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
      batch_sum_multiplier_.cpu_data(), variance_.cpu_data(), 0.,
      num_by_chans_.mutable_cpu_data());
  // temp_ = 1.0 * num_by_chans_ * spatial_sum_multiplier_
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
      spatial_dim, 1, 1., num_by_chans_.cpu_data(),
      spatial_sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data());

  // element-wise: top_data[i] = top_data[i] / temp_[i]
  caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data);
  // TODO(cdoersch): The caching is only needed because later in-place layers
  //                 might clobber the data.  Can we skip this if they won't?
  // Cache the normalized result top_data into x_norm_ for use in the backward pass.
  caffe_copy(x_norm_.count(), top_data, x_norm_.mutable_cpu_data());
}


// Backward pass; see the gradient formula in the comment block below.
template <typename Dtype>
void BatchNormLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  const Dtype* top_diff; // gradient w.r.t. the layer output (dE/dY)
  if (bottom[0] != top[0]) {
    top_diff = top[0]->cpu_diff();
  } else {
    caffe_copy(x_norm_.count(), top[0]->cpu_diff(), x_norm_.mutable_cpu_diff());
    top_diff = x_norm_.cpu_diff();
  }
  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
  if (use_global_stats_) {
    caffe_div(temp_.count(), top_diff, temp_.cpu_data(), bottom_diff);
    return;
  }
  const Dtype* top_data = x_norm_.cpu_data();
  int num = bottom[0]->shape()[0];
  int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_);
  // if Y = (X-mean(X))/(sqrt(var(X)+eps)), then
  //
  // dE(Y)/dX =
  //   (dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y)
  //     ./ sqrt(var(X) + eps)
  //
  // where \cdot and ./ are hadamard product and elementwise division,
  // respectively, dE/dY is the top diff, and mean/var/sum are all computed
  // along all dimensions except the channels dimension.  In the above
  // equation, the operations allow for expansion (i.e. broadcast) along all
  // dimensions except the channels dimension where required.

  // sum(dE/dY \cdot Y)
  caffe_mul(temp_.count(), top_data, top_diff, bottom_diff);
  caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim, 1.,
      bottom_diff, spatial_sum_multiplier_.cpu_data(), 0.,
      num_by_chans_.mutable_cpu_data());
  caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
      num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
      mean_.mutable_cpu_data());

  // reshape (broadcast) the above
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
      batch_sum_multiplier_.cpu_data(), mean_.cpu_data(), 0.,
      num_by_chans_.mutable_cpu_data());
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
      spatial_dim, 1, 1., num_by_chans_.cpu_data(),
      spatial_sum_multiplier_.cpu_data(), 0., bottom_diff);

  // sum(dE/dY \cdot Y) \cdot Y
  caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff);

  // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y
  caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim, 1.,
      top_diff, spatial_sum_multiplier_.cpu_data(), 0.,
      num_by_chans_.mutable_cpu_data());
  caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
      num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
      mean_.mutable_cpu_data());
  // reshape (broadcast) the above to make
  // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
      batch_sum_multiplier_.cpu_data(), mean_.cpu_data(), 0.,
      num_by_chans_.mutable_cpu_data());
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num * channels_,
      spatial_dim, 1, 1., num_by_chans_.cpu_data(),
      spatial_sum_multiplier_.cpu_data(), 1., bottom_diff);

  // dE/dY - mean(dE/dY)-mean(dE/dY \cdot Y) \cdot Y
  caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff,
      Dtype(-1. / (num * spatial_dim)), bottom_diff);

  // note: temp_ still contains sqrt(var(X)+eps), computed during the forward
  // pass.
  caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff);
}


#ifdef CPU_ONLY
STUB_GPU(BatchNormLayer);
#endif

INSTANTIATE_CLASS(BatchNormLayer);
REGISTER_LAYER_CLASS(BatchNorm);
}  // namespace caffe
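
To make the moving-average bookkeeping in Forward_cpu easier to follow, here is a minimal standalone sketch of the same update rule (plain C++ with illustrative names, not the Caffe API). blobs_[0] and blobs_[1] hold exponentially weighted sums rather than averages, and blobs_[2] holds the matching sum of weights, so dividing by it at test time recovers a properly normalized average.

#include <cstddef>
#include <vector>

// Mirrors blobs_[0] (running mean sum), blobs_[1] (running variance sum) and
// blobs_[2] (accumulated weight) of BatchNormLayer.
struct RunningStats {
  std::vector<float> mean_sum, var_sum;  // un-normalized running sums, one per channel
  float scale = 0.f;                     // corresponds to blobs_[2][0]
};

// Training-time update, matching the axpby calls in Forward_cpu:
//   scale    = lambda * scale + 1
//   mean_sum = 1 * batch_mean + lambda * mean_sum
//   var_sum  = m/(m-1) * batch_var + lambda * var_sum   (unbiased correction, m = num*H*W)
void moving_average_update(RunningStats& s,
                           const std::vector<float>& batch_mean,
                           const std::vector<float>& batch_var,
                           int m, float lambda) {
  s.scale = lambda * s.scale + 1.f;
  const float bias_correction = m > 1 ? static_cast<float>(m) / (m - 1) : 1.f;
  for (std::size_t c = 0; c < batch_mean.size(); ++c) {
    s.mean_sum[c] = batch_mean[c] + lambda * s.mean_sum[c];
    s.var_sum[c]  = bias_correction * batch_var[c] + lambda * s.var_sum[c];
  }
}

// Test-time statistics, matching the caffe_cpu_scale calls at the top of Forward_cpu:
//   mean = mean_sum / scale,  var = var_sum / scale
void global_stats(const RunningStats& s,
                  std::vector<float>& mean, std::vector<float>& var) {
  const float inv = (s.scale == 0.f) ? 0.f : 1.f / s.scale;
  for (std::size_t c = 0; c < s.mean_sum.size(); ++c) {
    mean[c] = s.mean_sum[c] * inv;
    var[c]  = s.var_sum[c] * inv;
  }
}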

(2) The Forward_cpu function, annotated in more detail

template <typename Dtype>
void BatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();
  Dtype* top_data = top[0]->mutable_cpu_data();
  int num = bottom[0]->shape(0);
  int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_);

  // If this is not an in-place operation, first copy bottom's data to top.
  if (bottom[0] != top[0]) {
    caffe_copy(bottom[0]->count(), bottom_data, top_data);
  }

  // When using the global statistics, first recover the actual mean and var
  // from the stored running sums.
  if (use_global_stats_) {
    // use the stored mean/variance estimates.
    const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ?
        0 : 1 / this->blobs_[2]->cpu_data()[0];
    // mean = blobs[0] / blobs[2]
    caffe_cpu_scale(variance_.count(), scale_factor,
        this->blobs_[0]->cpu_data(), mean_.mutable_cpu_data());
    // var = blobs[1] / blobs[2]
    caffe_cpu_scale(variance_.count(), scale_factor,
        this->blobs_[1]->cpu_data(), variance_.mutable_cpu_data());
  } else {
    // Without global statistics, normalize with the mean and var of the current batch.
    // compute mean
    // spatial_sum_multiplier_ is an all-ones vector.
    // batch_sum_multiplier_ is also an all-ones vector.
    // gemv computes a matrix-vector product: y = alpha*A*x + beta*y.
    // The call below multiplies the matrix bottom_data by an all-ones vector,
    // which amounts to taking row sums.
    // Note that the second argument, channels_ * num, is the number of rows and the
    // third argument is the number of columns, so this sums each channel's feature map:
    // out[n][c] is the sum over the c-th channel of the n-th sample.
    // The factor 1. / (num * spatial_dim) is folded in here; its purpose is explained below.
    caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
        1. / (num * spatial_dim), bottom_data,
        spatial_sum_multiplier_.cpu_data(), 0.,
        num_by_chans_.mutable_cpu_data());
    // Same idea as above, except CblasTrans marks the matrix as transposed, so this
    // time we sum columns, accumulating over the batch to get one value per channel.
    // We already divided by num * spatial_dim above, which is exactly the total number
    // of summed elements, so at this point the per-channel means of the current batch
    // are complete.
    caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
        num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
        mean_.mutable_cpu_data());
  }

  // subtract mean
  // gemm computes a matrix-matrix product: C = alpha*A*B + beta*C.
  // The two calls below implement a broadcasting subtraction of the mean.
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
      batch_sum_multiplier_.cpu_data(), mean_.cpu_data(), 0.,
      num_by_chans_.mutable_cpu_data());
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
      spatial_dim, 1, -1, num_by_chans_.cpu_data(),
      spatial_sum_multiplier_.cpu_data(), 1., top_data);

  // compute the variance of the current batch
  if (!use_global_stats_) {
    // compute variance using var(X) = E((X-EX)^2)
    caffe_sqr<Dtype>(top[0]->count(), top_data,
                     temp_.mutable_cpu_data());  // (X-EX)^2
    caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
        1. / (num * spatial_dim), temp_.cpu_data(),
        spatial_sum_multiplier_.cpu_data(), 0.,
        num_by_chans_.mutable_cpu_data());
    caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
        num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
        variance_.mutable_cpu_data());  // E((X-EX)^2)

    // compute and save moving average
    // Exponential moving average: update the global statistics (see the update rule above).
    this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_;
    this->blobs_[2]->mutable_cpu_data()[0] += 1;
    caffe_cpu_axpby(mean_.count(), Dtype(1), mean_.cpu_data(),
        moving_average_fraction_, this->blobs_[0]->mutable_cpu_data());
    int m = bottom[0]->count()/channels_;
    Dtype bias_correction_factor = m > 1 ? Dtype(m)/(m-1) : 1;
    caffe_cpu_axpby(variance_.count(), bias_correction_factor,
        variance_.cpu_data(), moving_average_fraction_,
        this->blobs_[1]->mutable_cpu_data());
  }

  // normalize variance
  caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data());
  caffe_sqrt(variance_.count(), variance_.cpu_data(),
             variance_.mutable_cpu_data());

  // replicate variance to input size
  // again a broadcasting operation: expand the per-channel std-dev to the full input size
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
      batch_sum_multiplier_.cpu_data(), variance_.cpu_data(), 0.,
      num_by_chans_.mutable_cpu_data());
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
      spatial_dim, 1, 1., num_by_chans_.cpu_data(),
      spatial_sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data());
  caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data);
  // TODO(cdoersch): The caching is only needed because later in-place layers
  //                 might clobber the data.  Can we skip this if they won't?
  caffe_copy(x_norm_.count(), top_data,
      x_norm_.mutable_cpu_data());
}
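
The "all-ones multiplier" trick that both annotated passes rely on can be verified with a few lines of plain C++ (a standalone sketch, not the Caffe API): multiplying an M x N matrix by a length-N vector of ones gives its M row sums, and the transposed product with a length-M vector of ones gives its N column sums. Forward_cpu applies this twice, first reducing the (num*channels_) x spatial_dim view of the data, then reducing the resulting num x channels_ matrix.

#include <cstddef>
#include <cstdio>
#include <vector>

// y = alpha * A * ones(N) for a row-major M x N matrix A, i.e. scaled row sums.
// This is what caffe_cpu_gemv(CblasNoTrans, M, N, alpha, A, ones, 0., y) computes above.
std::vector<double> row_sums(const std::vector<double>& A, int M, int N, double alpha) {
  std::vector<double> y(M, 0.0);
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j)
      y[i] += alpha * A[i * N + j];
  return y;
}

// y = A^T * ones(M), i.e. column sums, matching the CblasTrans call.
std::vector<double> col_sums(const std::vector<double>& A, int M, int N) {
  std::vector<double> y(N, 0.0);
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j)
      y[j] += A[i * N + j];
  return y;
}

int main() {
  // Toy input: num = 2 samples, channels = 3, spatial_dim = 4.
  const int num = 2, channels = 3, spatial_dim = 4;
  std::vector<double> data(num * channels * spatial_dim);
  for (std::size_t i = 0; i < data.size(); ++i) data[i] = static_cast<double>(i);

  // Step 1: view data as a (num*channels) x spatial_dim matrix and take scaled
  // row sums -> one value per (sample, channel) pair, like num_by_chans_.
  std::vector<double> num_by_chans =
      row_sums(data, num * channels, spatial_dim, 1.0 / (num * spatial_dim));

  // Step 2: view num_by_chans as a num x channels matrix and take column sums
  // -> one value per channel, i.e. the per-channel batch mean, like mean_.
  std::vector<double> mean = col_sums(num_by_chans, num, channels);

  for (int c = 0; c < channels; ++c)
    std::printf("mean of channel %d = %f\n", c, mean[c]);
  return 0;
}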

III. References
https://xmfbit.github.io/2018/01/08/caffe-batch-norm/
https://cloud.tencent.com/developer/article/1391839
https://blog.csdn.net/seven_first/article/details/47378697#2-caffecpugemv-%E5%87%BD%E6%95%B0
https://blog.csdn.net/malefactor/article/details/51476961
