BatchNorm Caffe C++ Source Code Analysis


Author: 加油11dd23 | Published 2020-10-20 21:49

    An analysis of BatchNorm, with a focus on how the dimensions are handled.

    I. The BatchNorm procedure, described briefly:

    1. For each channel within a batch, subtract the mean and divide by the standard deviation, so that the values are mapped onto (approximately) a standard normal distribution [central limit theorem].
    2. Doing only this, however, would largely neutralize the nonlinear activation function, so a learned scale and shift are applied afterwards, letting the data deviate slightly from the standard normal distribution again (a minimal numeric sketch follows the flow figure below).
    • (Personally, I feel BatchNorm is somewhat superfluous; wouldn't a nonlinear activation alone do the job?)
    • (BatchNorm is also tied to the i.i.d. assumption: we implicitly assume the training data and the test data come from the same distribution. Because the network's parameters keep changing during training, each layer's output distribution drifts away from the distribution of its inputs, so the distribution is re-normalized batch by batch. I am not entirely clear on this part yet and will fill it in when I update this post.)
    • If the distribution of the inputs X in an ML system's training set <X, Y> keeps changing, the i.i.d. assumption no longer holds, so how is the model supposed to learn a stable mapping? You would practically need transfer learning to cope, because the system would also have to keep adapting to the shifting distribution.
      For deep networks with many hidden layers, the parameters of every layer keep changing during training, so every hidden layer faces this covariate-shift problem: its input distribution keeps drifting as training proceeds. This is the so-called "Internal Covariate Shift"; "internal" emphasizes that the shift happens inside the network, at the hidden layers, rather than only at the input layer, where ordinary covariate shift occurs.


      Figure: BatchNorm flow (batchnorm流程.png)
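
    As a minimal illustration of the two steps above, here is a loop-based C++ sketch for a single channel in training mode. It is my own toy example rather than Caffe code; the gamma/beta scale-and-shift, which Caffe delegates to a separate Scale layer, is folded in at the end.

    #include <cmath>
    #include <cstdio>
    #include <vector>
    
    // Minimal sketch of BatchNorm for a single channel in training mode.
    // 'x' holds every value of that channel across the batch and spatial dims;
    // gamma/beta are the learned scale and shift (a separate Scale layer in Caffe).
    std::vector<float> batch_norm_channel(const std::vector<float>& x,
                                          float gamma, float beta,
                                          float eps = 1e-5f) {
      const float m = static_cast<float>(x.size());
      float mean = 0.f, var = 0.f;
      for (float v : x) mean += v;
      mean /= m;
      for (float v : x) var += (v - mean) * (v - mean);
      var /= m;  // biased (1/m) variance, as in the forward pass
      std::vector<float> y(x.size());
      for (std::size_t i = 0; i < x.size(); ++i)
        y[i] = gamma * (x[i] - mean) / std::sqrt(var + eps) + beta;
      return y;
    }
    
    int main() {
      std::vector<float> x = {1.f, 2.f, 3.f, 4.f};   // one channel, N*H*W = 4 values
      for (float v : batch_norm_channel(x, 1.f, 0.f))
        std::printf("%.4f ", v);                     // ~ -1.3416 -0.4472 0.4472 1.3416
      std::printf("\n");
      return 0;
    }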

    II. Annotated Caffe C++ source code


    (1) batch_norm_layer.cpp

    #include <algorithm>
    #include <vector>
    
    #include "caffe/layers/batch_norm_layer.hpp"
    #include "caffe/util/math_functions.hpp"
    
    namespace caffe {
    
    template <typename Dtype>
    void BatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
          const vector<Blob<Dtype>*>& top) {
      BatchNormParameter param = this->layer_param_.batch_norm_param(); // BatchNorm parameters
      moving_average_fraction_ = param.moving_average_fraction(); // moving-average decay factor
      // During training, mean and variance are computed over the mini-batch.
      // At test time, mean and variance are estimates over the whole dataset.
      use_global_stats_ = this->phase_ == TEST; 
      if (param.has_use_global_stats())
        use_global_stats_ = param.use_global_stats();
      if (bottom[0]->num_axes() == 1)
        channels_ = 1;
      else
        channels_ = bottom[0]->shape(1);
      eps_ = param.eps();
      if (this->blobs_.size() > 0) {
        LOG(INFO) << "Skipping parameter initialization";
      } else {
        this->blobs_.resize(3); // the three stored parameter blobs
        vector<int> sz;
        sz.push_back(channels_);
        this->blobs_[0].reset(new Blob<Dtype>(sz)); // moving average of the mean, an array of size channels_
        this->blobs_[1].reset(new Blob<Dtype>(sz)); // moving average of the variance, an array of size channels_
        sz[0] = 1;
        this->blobs_[2].reset(new Blob<Dtype>(sz)); // moving-average scale factor, an array of size 1
        for (int i = 0; i < 3; ++i) {
          caffe_set(this->blobs_[i]->count(), Dtype(0),
                    this->blobs_[i]->mutable_cpu_data()); // initialize the values to 0
        }
        }
      }
      // Mask statistics from optimization by setting local learning rates
      // for mean, variance, and the bias correction to zero.
      for (int i = 0; i < this->blobs_.size(); ++i) {
        if (this->layer_param_.param_size() == i) {
          ParamSpec* fixed_param_spec = this->layer_param_.add_param();
          fixed_param_spec->set_lr_mult(0.f);
        } else {
          CHECK_EQ(this->layer_param_.param(i).lr_mult(), 0.f)
              << "Cannot configure batch normalization statistics as layer "
              << "parameters.";
        }
      }
    }
    
    template <typename Dtype>
    void BatchNormLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
          const vector<Blob<Dtype>*>& top) {
      // If bottom has only one axis, there is a single mean/variance;
      // otherwise their count equals the number of channels.
      if (bottom[0]->num_axes() >= 1)
        CHECK_EQ(bottom[0]->shape(1), channels_);
      top[0]->ReshapeLike(*bottom[0]); // top[0] has the same shape as bottom[0]
    
      vector<int> sz;
      sz.push_back(channels_);
      mean_.Reshape(sz); // per-channel mean
      variance_.Reshape(sz); // per-channel variance
      temp_.ReshapeLike(*bottom[0]); // scratch: each element's squared deviation (x - mean_)^2, later reused for the broadcast sqrt(var + eps)
      x_norm_.ReshapeLike(*bottom[0]);
      sz[0] = bottom[0]->shape(0);
      batch_sum_multiplier_.Reshape(sz); // all-ones vector of length batch size
    
      // spatial dimension, height*width
      int spatial_dim = bottom[0]->count()/(channels_*bottom[0]->shape(0));
    
      if (spatial_sum_multiplier_.num_axes() == 0 ||
          spatial_sum_multiplier_.shape(0) != spatial_dim) {
        sz[0] = spatial_dim;
        spatial_sum_multiplier_.Reshape(sz);
        Dtype* multiplier_data = spatial_sum_multiplier_.mutable_cpu_data();
        // spatial_sum_multiplier_ is initialized to 1; its size is height*width
        caffe_set(spatial_sum_multiplier_.count(), Dtype(1), multiplier_data);
      }
    
      int numbychans = channels_*bottom[0]->shape(0); // channels * batchsize
      if (num_by_chans_.num_axes() == 0 ||
          num_by_chans_.shape(0) != numbychans) {
        sz[0] = numbychans;
        num_by_chans_.Reshape(sz);
        caffe_set(batch_sum_multiplier_.count(), Dtype(1),
            batch_sum_multiplier_.mutable_cpu_data()); // initialize the values to 1
      }
    }
    
    // Forward pass: the mean and variance are computed as matrix-vector products.
    template <typename Dtype>
    void BatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
        const vector<Blob<Dtype>*>& top) {
      const Dtype* bottom_data = bottom[0]->cpu_data();
      Dtype* top_data = top[0]->mutable_cpu_data();
      int num = bottom[0]->shape(0); // batch size
      int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_); // height*width
    
      // Check whether the layer is run in-place (bottom[0] and top[0] are the same blob).
      if (bottom[0] != top[0]) {
        caffe_copy(bottom[0]->count(), bottom_data, top_data);
      }
    
      if (use_global_stats_) {
        // If use_global_stats_ == 1, use the stored mean/variance estimates,
        // scaled by 1 / blobs_[2] (the accumulated moving-average factor).
        const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ?
            0 : 1 / this->blobs_[2]->cpu_data()[0];
        caffe_cpu_scale(variance_.count(), scale_factor,
            this->blobs_[0]->cpu_data(), mean_.mutable_cpu_data()); // mean_ = blobs_[0] * scale_factor
        caffe_cpu_scale(variance_.count(), scale_factor,
            this->blobs_[1]->cpu_data(), variance_.mutable_cpu_data()); // variance_ = blobs_[1] * scale_factor
      } else {
        // use_global_stats_ == 0: compute the mean over the current mini-batch
        // compute mean
        // num_by_chans_ = (1. / (num * spatial_dim)) * bottom_data * spatial_sum_multiplier_
        // bottom_data is viewed as a (channels * num) x spatial_dim matrix,
        // so the result holds channels * num values (one per sample and channel)
        caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
            1. / (num * spatial_dim), bottom_data,
            spatial_sum_multiplier_.cpu_data(), 0.,
            num_by_chans_.mutable_cpu_data());
    
        // mean_ = 1 * num_by_chans_ * batch_sum_multiplier_
        // num_by_chans_ is now read as a num x channels matrix (note CblasTrans)
        // summing over the batch gives one value per channel: the mini-batch mean
        caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
            num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
            mean_.mutable_cpu_data());
      }
    
      // subtract mean
      // num_by_chans_ = 1 * batch_sum_multiplier_ * mean_  (broadcast the mean over the batch)
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
          batch_sum_multiplier_.cpu_data(), mean_.cpu_data(), 0.,
          num_by_chans_.mutable_cpu_data());
      // top_data = -1 * num_by_chans_ * spatial_sum_multiplier_ + 1.0 * top_data
      // i.e. subtract the broadcast mean_ from every element of top_data
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
          spatial_dim, 1, -1, num_by_chans_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), 1., top_data);
    
      if (!use_global_stats_) {
        // use_global_stats_ == 0: compute the mini-batch variance
        // compute variance using var(X) = E((X-EX)^2)
        // square each centered element; the result is stored in temp_
        caffe_sqr<Dtype>(top[0]->count(), top_data, temp_.mutable_cpu_data());  // (X-EX)^2
        // num_by_chans_ = (1. / (num * spatial_dim)) * temp_ * spatial_sum_multiplier_
        caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
            1. / (num * spatial_dim), temp_.cpu_data(),
            spatial_sum_multiplier_.cpu_data(), 0.,
            num_by_chans_.mutable_cpu_data()); // matrix-vector product: sum over the spatial dims
        // variance_ = 1.0 * num_by_chans_ * batch_sum_multiplier_
        caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
            num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
            variance_.mutable_cpu_data());  // E((X-EX)^2)
    
        // compute and save the moving averages
        // step [F1]: blobs_[2] = moving_average_fraction_ * blobs_[2] + 1
        this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_;
        this->blobs_[2]->mutable_cpu_data()[0] += 1;
    
        // this->blobs_[0] = 1 * mean_ + moving_average_fraction_ * this->blobs_[0]
        // step [F2]: update the running mean
        caffe_cpu_axpby(mean_.count(), Dtype(1), mean_.cpu_data(),
            moving_average_fraction_, this->blobs_[0]->mutable_cpu_data());
        // m = num * height * width 
        int m = bottom[0]->count()/channels_;
    
        Dtype bias_correction_factor = m > 1 ? Dtype(m)/(m-1) : 1;
        // this->blobs_[1] = bias_correction_factor * variance_ + moving_average_fraction_ * this->blobs_[1]
        // the factor m/(m-1) gives the unbiased variance estimate
        // step [F3]: update the running variance
        caffe_cpu_axpby(variance_.count(), bias_correction_factor,
            variance_.cpu_data(), moving_average_fraction_,
            this->blobs_[1]->mutable_cpu_data());
      }
    
      // normalize variance
      // variance_ = variance_ + eps_  (add a small constant for numerical stability)
      caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data());
      // element-wise square root: variance_ now holds sqrt(var + eps)
      caffe_sqrt(variance_.count(), variance_.cpu_data(),
                 variance_.mutable_cpu_data());
    
      // replicate variance to input size
      // the next two gemm calls broadcast the channels_ values in variance_
      // up to the full channels_ * num * height * width shape
      // num_by_chans_ = 1 * batch_sum_multiplier_ * variance_
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
          batch_sum_multiplier_.cpu_data(), variance_.cpu_data(), 0.,
          num_by_chans_.mutable_cpu_data());
      // temp_ = 1.0 * num_by_chans_ * spatial_sum_multiplier_
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
          spatial_dim, 1, 1., num_by_chans_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data());
    
      // element-wise: top_data[i] = top_data[i] / temp_[i]
      caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data);
      // TODO(cdoersch): The caching is only needed because later in-place layers
      //                 might clobber the data.  Can we skip this if they won't?
      // cache the normalized result in x_norm_ for use in the backward pass
      caffe_copy(x_norm_.count(), top_data, x_norm_.mutable_cpu_data());
    }
    
    
    // Backward pass; see the gradient formula in the comment inside the function.
    template <typename Dtype>
    void BatchNormLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
        const vector<bool>& propagate_down,
        const vector<Blob<Dtype>*>& bottom) {
      const Dtype* top_diff; // gradient flowing in from the top blob
      if (bottom[0] != top[0]) {
        top_diff = top[0]->cpu_diff();
      } else {
        caffe_copy(x_norm_.count(), top[0]->cpu_diff(), x_norm_.mutable_cpu_diff());
        top_diff = x_norm_.cpu_diff();
      }
      Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
      if (use_global_stats_) {
        caffe_div(temp_.count(), top_diff, temp_.cpu_data(), bottom_diff);
        return;
      }
      const Dtype* top_data = x_norm_.cpu_data();
      int num = bottom[0]->shape()[0];
      int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_);
      // if Y = (X-mean(X))/(sqrt(var(X)+eps)), then
      //
      // dE(Y)/dX =
      //   (dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y)
      //     ./ sqrt(var(X) + eps)
      //
      // where \cdot and ./ are hadamard product and elementwise division,
      // respectively, dE/dY is the top diff, and mean/var/sum are all computed
      // along all dimensions except the channels dimension.  In the above
      // equation, the operations allow for expansion (i.e. broadcast) along all
      // dimensions except the channels dimension where required.
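      //
      // Why this holds: with m = num * spatial_dim and s = sqrt(var(X) + eps),
      //   dY_j/dX_i = (delta_ij - 1/m - Y_i * Y_j / m) / s,
      // so summing dE/dY_j * dY_j/dX_i over j gives
      //   dE/dX_i = (dE/dY_i - mean(dE/dY) - Y_i * mean(dE/dY \cdot Y)) / s,
      // which is exactly the expression assembled step by step below.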
    
      // sum(dE/dY \cdot Y)
      caffe_mul(temp_.count(), top_data, top_diff, bottom_diff);
      caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim, 1.,
          bottom_diff, spatial_sum_multiplier_.cpu_data(), 0.,
          num_by_chans_.mutable_cpu_data());
      caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
          num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
          mean_.mutable_cpu_data());
    
      // reshape (broadcast) the above
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
          batch_sum_multiplier_.cpu_data(), mean_.cpu_data(), 0.,
          num_by_chans_.mutable_cpu_data());
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
          spatial_dim, 1, 1., num_by_chans_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), 0., bottom_diff);
    
      // sum(dE/dY \cdot Y) \cdot Y
      caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff);
    
      // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y
      caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim, 1.,
          top_diff, spatial_sum_multiplier_.cpu_data(), 0.,
          num_by_chans_.mutable_cpu_data());
      caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
          num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
          mean_.mutable_cpu_data());
      // reshape (broadcast) the above to make
      // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
          batch_sum_multiplier_.cpu_data(), mean_.cpu_data(), 0.,
          num_by_chans_.mutable_cpu_data());
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num * channels_,
          spatial_dim, 1, 1., num_by_chans_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), 1., bottom_diff);
    
      // dE/dY - mean(dE/dY)-mean(dE/dY \cdot Y) \cdot Y
      caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff,
          Dtype(-1. / (num * spatial_dim)), bottom_diff);
    
      // note: temp_ still contains sqrt(var(X)+eps), computed during the forward
      // pass.
      caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff);
    }
    
    
    #ifdef CPU_ONLY
    STUB_GPU(BatchNormLayer);
    #endif
    
    INSTANTIATE_CLASS(BatchNormLayer);
    REGISTER_LAYER_CLASS(BatchNorm);
    }  // namespace caffe
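
    A detail that is easy to miss in Forward_cpu above: blobs_[0] and blobs_[1] hold exponentially decayed sums of the per-batch statistics rather than averages, and blobs_[2] holds the matching decayed count, which is why the stored values are multiplied by scale_factor = 1 / blobs_[2] at test time. Below is a small standalone sketch of just that bookkeeping, with struct and member names of my own choosing (it is not Caffe API code), following the [F1]/[F2]/[F3] steps noted in the comments.

    #include <cstdio>
    
    // Sketch (my own names, not Caffe API): how blobs_[0..2] evolve for one channel.
    // Caffe stores decayed sums of the batch statistics plus a decayed count
    // (blobs_[2]); the true running mean/var is sum / count.
    struct RunningStat {
      float sum_mean = 0.f;   // plays the role of blobs_[0]
      float sum_var  = 0.f;   // plays the role of blobs_[1]
      float count    = 0.f;   // plays the role of blobs_[2]
    
      void update(float batch_mean, float batch_var, int m, float lambda) {
        count    = count * lambda + 1.f;                       // [F1]
        sum_mean = batch_mean + lambda * sum_mean;             // [F2]
        float unbias = m > 1 ? float(m) / (m - 1) : 1.f;       // bias correction m/(m-1)
        sum_var  = unbias * batch_var + lambda * sum_var;      // [F3]
      }
      float mean() const { return count == 0.f ? 0.f : sum_mean / count; }
      float var()  const { return count == 0.f ? 0.f : sum_var  / count; }
    };
    
    int main() {
      RunningStat s;
      s.update(0.5f, 1.0f, 8, 0.999f);   // one training iteration, m = N*H*W
      s.update(0.7f, 1.2f, 8, 0.999f);
      std::printf("mean=%.4f var=%.4f\n", s.mean(), s.var());
      return 0;
    }

    With moving_average_fraction_ (lambda) close to 1, sum / count behaves like an exponential moving average of the per-batch statistics.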
    

    (2) The BatchNorm forward function, annotated in more detail

    template <typename Dtype>
    void BatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
        const vector<Blob<Dtype>*>& top) {
      const Dtype* bottom_data = bottom[0]->cpu_data();
      Dtype* top_data = top[0]->mutable_cpu_data();
      int num = bottom[0]->shape(0);
      int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_);
    
      // If this is not an in-place operation, first copy bottom's data to top.
      if (bottom[0] != top[0]) {
        caffe_copy(bottom[0]->count(), bottom_data, top_data);
      }
    
      // When using the global statistics, first recover the actual mean and var from the stored sums.
      if (use_global_stats_) {
        // use the stored mean/variance estimates.
        const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ?
            0 : 1 / this->blobs_[2]->cpu_data()[0];
        // mean = blobs[0] / blobs[2]
        caffe_cpu_scale(variance_.count(), scale_factor,
            this->blobs_[0]->cpu_data(), mean_.mutable_cpu_data());
        // var = blobs[1] / blobs[2]
        caffe_cpu_scale(variance_.count(), scale_factor,
            this->blobs_[1]->cpu_data(), variance_.mutable_cpu_data());
      } else {
        // Without global statistics, normalize with the mean and var of the current batch.
        // compute mean
        // spatial_sum_multiplier_ is an all-ones vector
        // batch_sum_multiplier_ is also an all-ones vector
        // gemv computes a matrix-vector product: y = alpha*A*x + beta*y.
        // The call below multiplies the bottom_data matrix by an all-ones vector,
        // which amounts to taking row sums.
        // Note: the second argument, channels_ * num, is the number of matrix rows,
        // and the third argument is the number of columns,
        // so this computes the sum over each channel's feature map:
        // the result out[n][c] is the sum over channel c of the n-th input sample.
        // The factor 1. / (num * spatial_dim) is folded into the result here;
        // its purpose becomes clear below.
        caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
            1. / (num * spatial_dim), bottom_data,
            spatial_sum_multiplier_.cpu_data(), 0.,
            num_by_chans_.mutable_cpu_data());
        // Same idea as above, except that passing CblasTrans transposes the matrix,
        // so this takes column sums, yielding the per-channel sums.
        // We already divided by num * spatial_dim above, which is exactly the total
        // number of summed elements, so this gives the per-channel mean of the current batch.
        caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
            num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
            mean_.mutable_cpu_data());
      }
    
      // subtract mean
      // gemm computes a matrix-matrix product: C = alpha*A*B + beta*C.
      // The two calls below implement a broadcasting subtraction of the mean.
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
          batch_sum_multiplier_.cpu_data(), mean_.cpu_data(), 0.,
          num_by_chans_.mutable_cpu_data());
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
          spatial_dim, 1, -1, num_by_chans_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), 1., top_data);
    
      // compute the variance of the current batch
      if (!use_global_stats_) {
        // compute variance using var(X) = E((X-EX)^2)
        caffe_sqr<Dtype>(top[0]->count(), top_data,
                         temp_.mutable_cpu_data());  // (X-EX)^2
        caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
            1. / (num * spatial_dim), temp_.cpu_data(),
            spatial_sum_multiplier_.cpu_data(), 0.,
            num_by_chans_.mutable_cpu_data());
        caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
            num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
            variance_.mutable_cpu_data());  // E((X_EX)^2)
    
        // compute and save moving average
        // Moving-average update of the global statistics; see the update formulas above.
        this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_;
        this->blobs_[2]->mutable_cpu_data()[0] += 1;
        caffe_cpu_axpby(mean_.count(), Dtype(1), mean_.cpu_data(),
            moving_average_fraction_, this->blobs_[0]->mutable_cpu_data());
        int m = bottom[0]->count()/channels_;
        Dtype bias_correction_factor = m > 1 ? Dtype(m)/(m-1) : 1;
        caffe_cpu_axpby(variance_.count(), bias_correction_factor,
            variance_.cpu_data(), moving_average_fraction_,
            this->blobs_[1]->mutable_cpu_data());
      }
    
      // normalize variance
      caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data());
      caffe_sqrt(variance_.count(), variance_.cpu_data(),
                 variance_.mutable_cpu_data());
    
      // replicate variance to input size
      // again a broadcast: expand sqrt(var + eps) to the full input size
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
          batch_sum_multiplier_.cpu_data(), variance_.cpu_data(), 0.,
          num_by_chans_.mutable_cpu_data());
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
          spatial_dim, 1, 1., num_by_chans_.cpu_data(),
          spatial_sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data());
      caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data);
      // TODO(cdoersch): The caching is only needed because later in-place layers
      //                 might clobber the data.  Can we skip this if they won't?
      caffe_copy(x_norm_.count(), top_data,
          x_norm_.mutable_cpu_data());
    }
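
    To make the gemv trick described above concrete, here is a small standalone sketch that performs the same two-step reduction with plain loops instead of cblas calls (the function and variable names are my own, chosen for illustration): multiplying the (num*channels) x (H*W) data matrix by an all-ones vector collapses the spatial dimensions, and multiplying the resulting num x channels matrix, transposed, by an all-ones vector of length num collapses the batch dimension, leaving one mean per channel.

    #include <cstdio>
    #include <vector>
    
    // Sketch of the two-step reduction done by the caffe_cpu_gemv calls.
    // x is laid out as [N][C][H*W]; the ones-vector products are written as loops.
    std::vector<float> per_channel_mean(const std::vector<float>& x,
                                        int N, int C, int HW) {
      // Step 1: (N*C) x HW matrix times an all-ones vector -> per-(n,c) spatial sums,
      // pre-scaled by 1/(N*HW) like the first gemv in Forward_cpu.
      std::vector<float> num_by_chans(N * C, 0.f);
      for (int nc = 0; nc < N * C; ++nc)
        for (int k = 0; k < HW; ++k)
          num_by_chans[nc] += x[nc * HW + k] / float(N * HW);
      // Step 2: transpose of the N x C matrix times an all-ones vector of length N
      // -> one value per channel.
      std::vector<float> mean(C, 0.f);
      for (int n = 0; n < N; ++n)
        for (int c = 0; c < C; ++c)
          mean[c] += num_by_chans[n * C + c];
      return mean;
    }
    
    int main() {
      int N = 2, C = 2, HW = 3;
      std::vector<float> x = {1, 2, 3,  4, 5, 6,     // sample 0: channel 0, channel 1
                              7, 8, 9,  10, 11, 12}; // sample 1: channel 0, channel 1
      for (float m : per_channel_mean(x, N, C, HW))
        std::printf("%.3f ", m);   // channel means: 5.000 8.000
      std::printf("\n");
      return 0;
    }

    The pair of gemm calls in the layer runs the same ones-vector multiplications in the opposite direction, which is how mean_ and variance_ are broadcast back to the full input shape.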
    

    III. Reference links
    https://xmfbit.github.io/2018/01/08/caffe-batch-norm/
    https://cloud.tencent.com/developer/article/1391839
    https://blog.csdn.net/seven_first/article/details/47378697#2-caffecpugemv-%E5%87%BD%E6%95%B0
    https://blog.csdn.net/malefactor/article/details/51476961
