Analyzing the BatchNorm Dimension Problem
1. A one-sentence description of the BatchNorm procedure:
- For the data within each channel of a batch, subtract the mean and divide by the standard deviation, which normalizes those values toward a standard normal distribution [central limit theorem] (see the formulas sketched after this list).
- But doing only that would largely cancel the effect of the nonlinear activation, so a learnable scale and shift is applied afterwards, letting the data deviate slightly from the standard normal distribution.
- (Personally, BatchNorm feels a bit redundant to me; wouldn't a nonlinear activation on its own be enough?)
- (At its core, BatchNorm is also tied to the i.i.d. assumption: the training-set and test-set data are assumed to come from the same distribution. Because the network's parameters keep changing during training, each layer's output distribution drifts away from the input distribution, so the distribution is re-normalized batch by batch. I am still not entirely clear on this part and will fill it in when I update this post in a few days.)
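For reference, this is the standard per-channel formulation from the original BatchNorm paper. For one channel, with the m = N*H*W mini-batch values x_1, ..., x_m:

```latex
\mu       = \frac{1}{m}\sum_{i=1}^{m} x_i                   % mini-batch mean
\sigma^2  = \frac{1}{m}\sum_{i=1}^{m} (x_i - \mu)^2         % mini-batch variance
\hat{x}_i = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}}    % normalize
y_i       = \gamma\,\hat{x}_i + \beta                       % learnable scale and shift
```

Note that in Caffe the learnable scale and shift (gamma, beta) live in a separate ScaleLayer; the BatchNormLayer analyzed below performs only the normalization step and maintains the running mean/variance.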
If the distribution of the input values X in the ML system's example set <X, Y> keeps changing, that violates the i.i.d. assumption. How is the model supposed to learn a stable rule? You would have to bring in transfer learning to cope, and the ML system would additionally have to learn how to chase this shifting distribution.
For a deep network with many hidden layers, the parameters of every layer keep changing during training, so every hidden layer faces the covariate shift problem: the distribution of its inputs keeps drifting as training proceeds. This is the so-called "Internal Covariate Shift". "Internal" refers to the hidden layers of a deep network: the shift happens inside the network, rather than the covariate shift problem occurring only at the input layer.
[Figure: BatchNorm procedure flow diagram (batchnorm流程.png)]
2. Annotated Caffe C++ source code walkthrough
(1) batch_norm_layer.cpp
#include <algorithm>
#include <vector>
#include "caffe/layers/batch_norm_layer.hpp"
#include "caffe/util/math_functions.hpp"
namespace caffe {
template <typename Dtype>
void BatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
BatchNormParameter param = this->layer_param_.batch_norm_param(); // BatchNorm parameters
moving_average_fraction_ = param.moving_average_fraction(); // moving-average decay factor
// During training, mean and variance are computed over the mini-batch.
// During testing, mean and variance are estimated over the whole dataset (via the stored running averages).
use_global_stats_ = this->phase_ == TEST;
if (param.has_use_global_stats())
use_global_stats_ = param.use_global_stats();
if (bottom[0]->num_axes() == 1)
channels_ = 1;
else
channels_ = bottom[0]->shape(1);
eps_ = param.eps();
if (this->blobs_.size() > 0) {
LOG(INFO) << "Skipping parameter initialization";
} else {
this->blobs_.resize(3); // the stored parameter blobs
vector<int> sz;
sz.push_back(channels_);
this->blobs_[0].reset(new Blob<Dtype>(sz)); // moving average of the mean, an array of size channels_
this->blobs_[1].reset(new Blob<Dtype>(sz)); // moving average of the variance, an array of size channels_
sz[0] = 1;
this->blobs_[2].reset(new Blob<Dtype>(sz)); // moving-average normalization factor, an array of size 1
for (int i = 0; i < 3; ++i) {
caffe_set(this->blobs_[i]->count(), Dtype(0),
this->blobs_[i]->mutable_cpu_data()); // initialize the values to 0
}
}
// Mask statistics from optimization by setting local learning rates
// for mean, variance, and the bias correction to zero.
for (int i = 0; i < this->blobs_.size(); ++i) {
if (this->layer_param_.param_size() == i) {
ParamSpec* fixed_param_spec = this->layer_param_.add_param();
fixed_param_spec->set_lr_mult(0.f);
} else {
CHECK_EQ(this->layer_param_.param(i).lr_mult(), 0.f)
<< "Cannot configure batch normalization statistics as layer "
<< "parameters.";
}
}
}
template <typename Dtype>
void BatchNormLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
// If bottom has only one axis, there is a single mean/variance; otherwise there are channels_ of them
if (bottom[0]->num_axes() >= 1)
CHECK_EQ(bottom[0]->shape(1), channels_);
top[0]->ReshapeLike(*bottom[0]); // top[0] has the same shape as the input bottom[0]
vector<int> sz;
sz.push_back(channels_);
mean_.Reshape(sz); // holds the per-channel mean
variance_.Reshape(sz); // holds the per-channel variance
temp_.ReshapeLike(*bottom[0]); // scratch: holds (x - mean)^2, later the broadcast sqrt(var + eps)
x_norm_.ReshapeLike(*bottom[0]);
sz[0] = bottom[0]->shape(0);
batch_sum_multiplier_.Reshape(sz); // length equals the batch size
// spatial dimension, height*width
int spatial_dim = bottom[0]->count()/(channels_*bottom[0]->shape(0));
if (spatial_sum_multiplier_.num_axes() == 0 ||
spatial_sum_multiplier_.shape(0) != spatial_dim) {
sz[0] = spatial_dim;
spatial_sum_multiplier_.Reshape(sz);
Dtype* multiplier_data = spatial_sum_multiplier_.mutable_cpu_data();
// spatial_sum_multiplier_ is initialized to all ones; its size is height*width
caffe_set(spatial_sum_multiplier_.count(), Dtype(1), multiplier_data);
}
int numbychans = channels_*bottom[0]->shape(0); // channels * batchsize
if (num_by_chans_.num_axes() == 0 ||
num_by_chans_.shape(0) != numbychans) {
sz[0] = numbychans;
num_by_chans_.Reshape(sz);
caffe_set(batch_sum_multiplier_.count(), Dtype(1),
batch_sum_multiplier_.mutable_cpu_data()); // initialize batch_sum_multiplier_ to all ones
}
}
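// Summary of the buffers set up above, for an N x C x H x W input:
//   mean_, variance_         : C              (per-channel statistics)
//   temp_, x_norm_           : N x C x H x W  (same shape as bottom[0])
//   batch_sum_multiplier_    : N              (all ones; used to sum over the batch)
//   spatial_sum_multiplier_  : H*W            (all ones; used to sum over the spatial dims)
//   num_by_chans_            : N*C            (intermediate per-(n, c) sums)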
// Forward pass: computes the mean and variance as matrix-vector products.
template <typename Dtype>
void BatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
const Dtype* bottom_data = bottom[0]->cpu_data();
Dtype* top_data = top[0]->mutable_cpu_data();
int num = bottom[0]->shape(0); // batch size
int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_); // height*width
// check whether the BN layer is running in-place (input and output are the same blob)
if (bottom[0] != top[0]) {
caffe_copy(bottom[0]->count(), bottom_data, top_data);
}
if (use_global_stats_) {
// If use_global_stats_ is set, use the stored mean/variance estimates,
// rescaled by 1 / blobs_[2] (the accumulated moving-average weight).
const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ?
0 : 1 / this->blobs_[2]->cpu_data()[0];
caffe_cpu_scale(variance_.count(), scale_factor,
this->blobs_[0]->cpu_data(), mean_.mutable_cpu_data()); // mean_ = blobs_[0] * scale_factor
caffe_cpu_scale(variance_.count(), scale_factor,
this->blobs_[1]->cpu_data(), variance_.mutable_cpu_data());
} else {
// If use_global_stats_ is not set, compute the mean over the current mini-batch.
// compute mean
// num_by_chans_ = (1. / (num * spatial_dim)) * bottom_data * spatial_sum_multiplier_
// bottom_data is treated as a matrix with channels_ * num rows and spatial_dim columns,
// so the result holds channels_ * num values (one partial sum per (n, c) pair)
caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
1. / (num * spatial_dim), bottom_data,
spatial_sum_multiplier_.cpu_data(), 0.,
num_by_chans_.mutable_cpu_data());
// mean_ = 1 * num_by_chans_^T * batch_sum_multiplier_
// num_by_chans_ is treated as a matrix with num rows and channels_ columns (note CblasTrans),
// so the per-sample values of each channel are added up, giving channels_ values
caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
mean_.mutable_cpu_data());
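// Taken together, the two gemv calls above are equivalent to
// mean_[c] = (1 / (num * spatial_dim)) * sum over n, h, w of bottom[n][c][h][w],
// i.e. the mean of each channel taken over the batch and spatial dimensions.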
}
// subtract mean
// broadcast the per-channel mean over the batch:
// num_by_chans_ = 1 * batch_sum_multiplier_ * mean_
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
batch_sum_multiplier_.cpu_data(), mean_.cpu_data(), 0.,
num_by_chans_.mutable_cpu_data());
// top_data = -1 * num_by_chans_ * spatial_sum_multiplier_ + 1.0 * top_data
// i.e., subtract the broadcast mean mean_ from the data in top_data
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
spatial_dim, 1, -1, num_by_chans_.cpu_data(),
spatial_sum_multiplier_.cpu_data(), 1., top_data);
if (!use_global_stats_) {
// If use_global_stats_ is not set, compute the variance.
// compute variance using var(X) = E((X-EX)^2)
// square each element of the mean-subtracted data; the result goes into temp_
caffe_sqr<Dtype>(top[0]->count(), top_data, temp_.mutable_cpu_data()); // (X-EX)^2
// num_by_chans_ = (1. / (num * spatial_dim)) * temp_ * spatial_sum_multiplier_
caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
1. / (num * spatial_dim), temp_.cpu_data(),
spatial_sum_multiplier_.cpu_data(), 0.,
num_by_chans_.mutable_cpu_data()); // matrix-vector product: sum over the spatial dimensions
// variance_ = 1.0 * num_by_chans_ * batch_sum_multiplier_
caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
variance_.mutable_cpu_data()); // E((X-EX)^2)
// compute and save moving average
// update the moving-average normalization factor:
// blobs_[2] = moving_average_fraction_ * blobs_[2] + 1
this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_;
this->blobs_[2]->mutable_cpu_data()[0] += 1;
// this->blobs_[0] = 1 * mean_ + moving_average_fraction_ * this->blobs_[0]
// update the running mean
caffe_cpu_axpby(mean_.count(), Dtype(1), mean_.cpu_data(),
moving_average_fraction_, this->blobs_[0]->mutable_cpu_data());
// m = num * height * width
int m = bottom[0]->count()/channels_;
Dtype bias_correction_factor = m > 1 ? Dtype(m)/(m-1) : 1;
// this->blobs_[1] = bias_correction_factor * variance_ + moving_average_fraction_ * this->blobs_[1]
// unbiased variance estimate: scale the biased batch variance by m/(m-1)
// update the running variance
caffe_cpu_axpby(variance_.count(), bias_correction_factor,
variance_.cpu_data(), moving_average_fraction_,
this->blobs_[1]->mutable_cpu_data());
}
// normalize variance
// variance_ = variance_ + eps_ : add a small constant for numerical stability
caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data());
// take the element-wise square root, giving sqrt(var + eps)
caffe_sqrt(variance_.count(), variance_.cpu_data(),
variance_.mutable_cpu_data());
// replicate variance to input size
// the next two gemm calls broadcast the channels_ values of variance_
// to the full channels_ * num * height * width size
// num_by_chans_ = 1 * batch_sum_multiplier_ * variance_
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
batch_sum_multiplier_.cpu_data(), variance_.cpu_data(), 0.,
num_by_chans_.mutable_cpu_data());
// temp_ = 1.0 * num_by_chans_ * spatial_sum_multiplier_
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
spatial_dim, 1, 1., num_by_chans_.cpu_data(),
spatial_sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data());
// element-wise operation: top_data[i] = top_data[i] / temp_[i]
caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data);
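// At this point top_data holds the normalized value x_hat = (x - mean) / sqrt(var + eps).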
// TODO(cdoersch): The caching is only needed because later in-place layers
// might clobber the data. Can we skip this if they won't?
// copy the result in top_data into x_norm_ (cached for the backward pass)
caffe_copy(x_norm_.count(), top_data, x_norm_.mutable_cpu_data());
}
// Backward pass; the gradient formula is given in the comment block inside the function.
template <typename Dtype>
void BatchNormLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down,
const vector<Blob<Dtype>*>& bottom) {
const Dtype* top_diff; // gradient w.r.t. the layer output
if (bottom[0] != top[0]) {
top_diff = top[0]->cpu_diff();
} else {
caffe_copy(x_norm_.count(), top[0]->cpu_diff(), x_norm_.mutable_cpu_diff());
top_diff = x_norm_.cpu_diff();
}
Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
if (use_global_stats_) {
caffe_div(temp_.count(), top_diff, temp_.cpu_data(), bottom_diff);
return;
}
const Dtype* top_data = x_norm_.cpu_data();
int num = bottom[0]->shape()[0];
int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_);
// if Y = (X-mean(X))/(sqrt(var(X)+eps)), then
//
// dE(Y)/dX =
// (dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y)
// ./ sqrt(var(X) + eps)
//
// where \cdot and ./ are hadamard product and elementwise division,
// respectively, dE/dY is the top diff, and mean/var/sum are all computed
// along all dimensions except the channels dimension. In the above
// equation, the operations allow for expansion (i.e. broadcast) along all
// dimensions except the channels dimension where required.
// sum(dE/dY \cdot Y)
caffe_mul(temp_.count(), top_data, top_diff, bottom_diff);
caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim, 1.,
bottom_diff, spatial_sum_multiplier_.cpu_data(), 0.,
num_by_chans_.mutable_cpu_data());
caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
mean_.mutable_cpu_data());
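// Note: mean_ is reused here as scratch space; after this gemv it holds the per-channel
// sum of dE/dY .* Y (the division by num * spatial_dim happens in the final axpby below).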
// reshape (broadcast) the above
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
batch_sum_multiplier_.cpu_data(), mean_.cpu_data(), 0.,
num_by_chans_.mutable_cpu_data());
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
spatial_dim, 1, 1., num_by_chans_.cpu_data(),
spatial_sum_multiplier_.cpu_data(), 0., bottom_diff);
// sum(dE/dY \cdot Y) \cdot Y
caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff);
// sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y
caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim, 1.,
top_diff, spatial_sum_multiplier_.cpu_data(), 0.,
num_by_chans_.mutable_cpu_data());
caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
mean_.mutable_cpu_data());
// reshape (broadcast) the above to make
// sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
batch_sum_multiplier_.cpu_data(), mean_.cpu_data(), 0.,
num_by_chans_.mutable_cpu_data());
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num * channels_,
spatial_dim, 1, 1., num_by_chans_.cpu_data(),
spatial_sum_multiplier_.cpu_data(), 1., bottom_diff);
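// bottom_diff now holds the broadcast of sum(dE/dY) + sum(dE/dY \cdot Y) \cdot Y;
// the axpby below turns these sums into means and subtracts them from dE/dY.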
// dE/dY - mean(dE/dY)-mean(dE/dY \cdot Y) \cdot Y
caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff,
Dtype(-1. / (num * spatial_dim)), bottom_diff);
// note: temp_ still contains sqrt(var(X)+eps), computed during the forward
// pass.
caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff);
}
#ifdef CPU_ONLY
STUB_GPU(BatchNormLayer);
#endif
INSTANTIATE_CLASS(BatchNormLayer);
REGISTER_LAYER_CLASS(BatchNorm);
} // namespace caffe
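To make the gemv/gemm bookkeeping in Forward_cpu more concrete, here is a minimal loop-based sketch of the same training-time computation (normalization only; no scale/shift and no running-average update). It is written for this walkthrough, not part of Caffe, and the helper name and flat NCHW layout are assumptions.

```cpp
#include <cmath>
#include <vector>

// Loop-based reference for the training-time BatchNorm forward pass
// (normalization only, matching Caffe's BatchNormLayer). Illustration only.
void batch_norm_forward_reference(const std::vector<float>& bottom,
                                  std::vector<float>& top,
                                  int num, int channels, int spatial_dim,
                                  float eps) {
  top.resize(bottom.size());
  const int m = num * spatial_dim;  // number of values contributing to each channel
  for (int c = 0; c < channels; ++c) {
    // per-channel mean over the batch and spatial dimensions
    double mean = 0.0;
    for (int n = 0; n < num; ++n)
      for (int s = 0; s < spatial_dim; ++s)
        mean += bottom[(n * channels + c) * spatial_dim + s];
    mean /= m;
    // biased variance, var(X) = E((X - EX)^2), as used in the forward pass
    double var = 0.0;
    for (int n = 0; n < num; ++n)
      for (int s = 0; s < spatial_dim; ++s) {
        const double d = bottom[(n * channels + c) * spatial_dim + s] - mean;
        var += d * d;
      }
    var /= m;
    // normalize: (x - mean) / sqrt(var + eps)
    const double inv_std = 1.0 / std::sqrt(var + eps);
    for (int n = 0; n < num; ++n)
      for (int s = 0; s < spatial_dim; ++s) {
        const int idx = (n * channels + c) * spatial_dim + s;
        top[idx] = static_cast<float>((bottom[idx] - mean) * inv_std);
      }
  }
}
```

Caffe reaches the same result without explicit loops by viewing the data as a (num * channels) x spatial_dim matrix and reducing it twice with caffe_cpu_gemv.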
(2) The BatchNorm Forward_cpu function, with more detailed comments
template <typename Dtype>
void BatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
const Dtype* bottom_data = bottom[0]->cpu_data();
Dtype* top_data = top[0]->mutable_cpu_data();
int num = bottom[0]->shape(0);
int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_);
// If this is not an in-place operation, first copy the bottom data to top.
if (bottom[0] != top[0]) {
caffe_copy(bottom[0]->count(), bottom_data, top_data);
}
// When using global statistics, first recover the actual mean and var from the stored accumulators.
if (use_global_stats_) {
// use the stored mean/variance estimates.
const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ?
0 : 1 / this->blobs_[2]->cpu_data()[0];
// mean = blobs[0] / blobs[2]
caffe_cpu_scale(variance_.count(), scale_factor,
this->blobs_[0]->cpu_data(), mean_.mutable_cpu_data());
// var = blobs[1] / blobs[2]
caffe_cpu_scale(variance_.count(), scale_factor,
this->blobs_[1]->cpu_data(), variance_.mutable_cpu_data());
} else {
// When not using global statistics, normalize with the mean and variance of the current batch.
// compute mean
// spatial_sum_multiplier_ is a vector of all ones
// batch_sum_multiplier_ is also a vector of all ones
// gemv computes a matrix-vector product: y = alpha*A*x + beta*y.
// The call below multiplies the bottom_data matrix by a vector of ones,
// which amounts to computing row sums.
// Note that the second argument, channels_ * num, is the number of rows of the matrix
// and the third argument is the number of columns,
// so this computes the sum over each channel's feature map:
// the result out[n][c] is the sum of the c-th channel of the n-th sample.
// A factor of 1. / (num * spatial_dim) is also multiplied into the result; its purpose is explained below.
caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
1. / (num * spatial_dim), bottom_data,
spatial_sum_multiplier_.cpu_data(), 0.,
num_by_chans_.mutable_cpu_data());
// Same idea as above, but note that CblasTrans is passed, so the matrix is transposed
// and we are computing column sums. This yields the sum for each channel.
// Remember the division by num * spatial_dim above: that is exactly the total number of summed elements,
// so at this point we have obtained the per-channel mean of the current batch.
caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
mean_.mutable_cpu_data());
}
// subtract mean
// gemm computes a matrix-matrix product: C = alpha*A*B + beta*C
// The two calls below perform a broadcasting subtraction of the mean.
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
batch_sum_multiplier_.cpu_data(), mean_.cpu_data(), 0.,
num_by_chans_.mutable_cpu_data());
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
spatial_dim, 1, -1, num_by_chans_.cpu_data(),
spatial_sum_multiplier_.cpu_data(), 1., top_data);
// compute the variance of the current batch
if (!use_global_stats_) {
// compute variance using var(X) = E((X-EX)^2)
caffe_sqr<Dtype>(top[0]->count(), top_data,
temp_.mutable_cpu_data()); // (X-EX)^2
caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
1. / (num * spatial_dim), temp_.cpu_data(),
spatial_sum_multiplier_.cpu_data(), 0.,
num_by_chans_.mutable_cpu_data());
caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
variance_.mutable_cpu_data()); // E((X-EX)^2)
// compute and save moving average
// Moving-average update of the global statistics (lambda = moving_average_fraction_):
// blobs_[2] = lambda * blobs_[2] + 1
// blobs_[0] = batch_mean + lambda * blobs_[0]
// blobs_[1] = (m/(m-1)) * batch_var + lambda * blobs_[1]
this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_;
this->blobs_[2]->mutable_cpu_data()[0] += 1;
caffe_cpu_axpby(mean_.count(), Dtype(1), mean_.cpu_data(),
moving_average_fraction_, this->blobs_[0]->mutable_cpu_data());
int m = bottom[0]->count()/channels_;
Dtype bias_correction_factor = m > 1 ? Dtype(m)/(m-1) : 1;
caffe_cpu_axpby(variance_.count(), bias_correction_factor,
variance_.cpu_data(), moving_average_fraction_,
this->blobs_[1]->mutable_cpu_data());
}
// normalize variance
caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data());
caffe_sqrt(variance_.count(), variance_.cpu_data(),
variance_.mutable_cpu_data());
// replicate variance to input size
// again a broadcasting operation: expand sqrt(var + eps) to the full input size
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
batch_sum_multiplier_.cpu_data(), variance_.cpu_data(), 0.,
num_by_chans_.mutable_cpu_data());
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
spatial_dim, 1, 1., num_by_chans_.cpu_data(),
spatial_sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data());
caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data);
// TODO(cdoersch): The caching is only needed because later in-place layers
// might clobber the data. Can we skip this if they won't?
caffe_copy(x_norm_.count(), top_data,
x_norm_.mutable_cpu_data());
}
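One detail worth spelling out: blobs_[0] and blobs_[1] store exponentially weighted sums rather than the statistics themselves, and blobs_[2] stores the matching sum of weights, which is why Forward_cpu divides by blobs_[2] at test time. Below is a small sketch of that bookkeeping; the struct and its names are hypothetical, written only to mirror the update rules in the code above.

```cpp
// Sketch of how the running statistics are accumulated during training and
// recovered at test time, mirroring blobs_[0], blobs_[1], blobs_[2] above.
// The struct and member names are hypothetical.
struct RunningStats {
  double mean_sum = 0.0;  // plays the role of blobs_[0]
  double var_sum  = 0.0;  // plays the role of blobs_[1]
  double weight   = 0.0;  // plays the role of blobs_[2]

  // Called once per mini-batch with that batch's mean and bias-corrected variance.
  void Update(double batch_mean, double batch_var_corrected,
              double moving_average_fraction) {
    weight   = weight * moving_average_fraction + 1.0;
    mean_sum = batch_mean + moving_average_fraction * mean_sum;
    var_sum  = batch_var_corrected + moving_average_fraction * var_sum;
  }

  // Test time: divide by the accumulated weight, exactly like
  // scale_factor = 1 / blobs_[2] in Forward_cpu.
  double Mean() const     { return weight == 0.0 ? 0.0 : mean_sum / weight; }
  double Variance() const { return weight == 0.0 ? 0.0 : var_sum  / weight; }
};
```

Because the weight is updated as weight = moving_average_fraction * weight + 1, it approaches 1 / (1 - moving_average_fraction) over a long run, so dividing by it recovers a properly normalized moving average even early in training, when only a few batches have been accumulated.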
3. References
https://xmfbit.github.io/2018/01/08/caffe-batch-norm/
https://cloud.tencent.com/developer/article/1391839
https://blog.csdn.net/seven_first/article/details/47378697#2-caffecpugemv-%E5%87%BD%E6%95%B0
https://blog.csdn.net/malefactor/article/details/51476961