message LayerParameter {
  optional BatchNormParameter batch_norm_param = 139;
}

message BatchNormParameter {
  // If false, normalization is performed over the current mini-batch
  // and global statistics are accumulated (but not yet used) by a moving
  // average.
  // If true, those accumulated mean and variance values are used for the
  // normalization.
  // By default, it is set to false when the network is in the training
  // phase and true when the network is in the testing phase.
  optional bool use_global_stats = 1;

  // What fraction of the moving average remains each iteration?
  // Smaller values make the moving average decay faster, giving more
  // weight to the recent values.
  // Each iteration updates the moving average @f$ S_{t-1} @f$ with the
  // current mean @f$ Y_t @f$ by
  // @f$ S_t = (1-\beta)Y_t + \beta \cdot S_{t-1} @f$, where @f$ \beta @f$
  // is the moving_average_fraction parameter.
  optional float moving_average_fraction = 2 [default = .999];

  // Small value to add to the variance estimate so that we don't divide by
  // zero.
  optional float eps = 3 [default = 1e-5];
}
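With the default moving_average_fraction of 0.999, each update keeps 99.9% of the old running statistic: S_t = 0.001*Y_t + 0.999*S_{t-1}. As a concrete illustration, here is a minimal prototxt sketch (the layer and blob names bn1/conv1 are placeholders) with both parameters spelled out at their defaults; use_global_stats is normally omitted so that Caffe selects false in TRAIN and true in TEST automatically:

layer {
  name: "bn1"
  type: "BatchNorm"
  bottom: "conv1"
  top: "conv1"
  batch_norm_param {
    moving_average_fraction: 0.999  # beta in the update formula above
    eps: 1e-5
  }
}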
3. batch_norm_layer.hpp
#ifndef CAFFE_BATCHNORM_LAYER_HPP_
#define CAFFE_BATCHNORM_LAYER_HPP_
#include <vector>
#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"
namespace caffe {
/**
* @brief Normalizes the input to have 0-mean and/or unit (1) variance across
* the batch.
*
* This layer computes Batch Normalization as described in [1]. For each channel
* in the data (i.e. axis 1), it subtracts the mean and divides by the variance,
* where both statistics are computed across both spatial dimensions and across
* the different examples in the batch.
*
* By default, during training time, the network is computing global
* mean/variance statistics via a running average, which is then used at test
* time to allow deterministic outputs for each input. You can manually toggle
* whether the network is accumulating or using the statistics via the
* use_global_stats option. For reference, these statistics are kept in the
* layer's three blobs: (0) mean, (1) variance, and (2) moving average factor.
*
* Note that the original paper also included a per-channel learned bias and
* scaling factor. To implement this in Caffe, define a `ScaleLayer` configured
* with `bias_term: true` after each `BatchNormLayer` to handle both the bias
* and scaling factor; see the prototxt example after this listing.
*
* [1] S. Ioffe and C. Szegedy, "Batch Normalization: Accelerating Deep Network
* Training by Reducing Internal Covariate Shift." arXiv preprint
* arXiv:1502.03167 (2015).
*
* TODO(dox): thorough documentation for Forward, Backward, and proto params.
*/
template <typename Dtype>
class BatchNormLayer : public Layer<Dtype> {
public:
explicit BatchNormLayer(const LayerParameter& param)
: Layer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual inline const char* type() const { return "BatchNorm"; }
virtual inline int ExactNumBottomBlobs() const { return 1; }
virtual inline int ExactNumTopBlobs() const { return 1; }
protected:
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
Blob<Dtype> mean_, variance_, temp_, x_norm_;  // mean, variance, and scratch blobs
bool use_global_stats_;
Dtype moving_average_fraction_;
int channels_;
Dtype eps_;
// Extra temporary variables used to carry out sums/broadcasting using BLAS.
Blob<Dtype> batch_sum_multiplier_;
Blob<Dtype> num_by_chans_;
Blob<Dtype> spatial_sum_multiplier_;
};

}  // namespace caffe
#endif // CAFFE_BATCHNORM_LAYER_HPP_
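As the header comment notes, BatchNormLayer only normalizes; the learned per-channel scale and bias from the paper are supplied by a following Scale layer with bias_term: true. A minimal prototxt sketch of the usual pairing, with both layers operating in-place on conv1 (layer and blob names are placeholders):

layer {
  name: "bn1"
  type: "BatchNorm"
  bottom: "conv1"
  top: "conv1"
}
layer {
  name: "scale1"
  type: "Scale"
  bottom: "conv1"
  top: "conv1"
  scale_param {
    bias_term: true  # learn both the per-channel scale and bias
  }
}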
message LayerParameter {
  optional ScaleParameter scale_param = 142;
}

message ScaleParameter {
  // The first axis of bottom[0] (the first input Blob) along which to apply
  // bottom[1] (the second input Blob). May be negative to index from the end
  // (e.g., -1 for the last axis).
  // For example, if bottom[0] is 4D with shape 100x3x40x60, the output
  // top[0] will have the same shape, and bottom[1] may have any of the
  // following shapes (for the given value of axis):
  //    (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60
  //    (axis == 1 == -3) 3; 3x40; 3x40x60
  //    (axis == 2 == -2) 40; 40x60
  //    (axis == 3 == -1) 60
  // Furthermore, bottom[1] may have the empty shape (regardless of the value
  // of "axis") -- a scalar multiplier.
  optional int32 axis = 1 [default = 1];

  // (num_axes is ignored unless just one bottom is given and the scale is
  // a learned parameter of the layer. Otherwise, num_axes is determined by
  // the number of axes of the second bottom.)
  // The number of axes of the input (bottom[0]) covered by the scale
  // parameter, or -1 to cover all axes of bottom[0] starting from `axis`.
  // Set num_axes := 0, to multiply with a zero-axis Blob: a scalar.
  optional int32 num_axes = 2 [default = 1];

  // (filler is ignored unless just one bottom is given and the scale is
  // a learned parameter of the layer.)
  // The initialization for the learned scale parameter.
  // Default is the unit (1) initialization, resulting in the ScaleLayer
  // initially performing the identity operation.
  optional FillerParameter filler = 3;

  // Whether to also learn a bias (equivalent to a ScaleLayer+BiasLayer, but
  // may be more efficient). Initialized with bias_filler (defaults to 0).
  optional bool bias_term = 4 [default = false];
  optional FillerParameter bias_filler = 5;
}
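To make the axis semantics concrete, here is a minimal two-bottom sketch (blob names are placeholders): with bottom[0] of shape 100x3x40x60 and bottom[1] of shape 3x40, axis: 1 multiplies each 3x40 slice element-wise, broadcast over the trailing 60 elements:

layer {
  name: "scale_bcast"
  type: "Scale"
  bottom: "data"     # shape: 100x3x40x60
  bottom: "weights"  # shape: 3x40
  top: "scaled"      # shape: 100x3x40x60
  scale_param {
    axis: 1
  }
}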
#ifndef CAFFE_SCALE_LAYER_HPP_
#define CAFFE_SCALE_LAYER_HPP_
#include <vector>
#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"
#include "caffe/layers/bias_layer.hpp"
namespace caffe {

/**
* @brief Computes the elementwise product of two input Blobs, with the shape of
* the latter Blob "broadcast" to match the shape of the former.
* Equivalent to tiling the latter Blob, then computing the elementwise
* product. Note: for efficiency and convenience, this layer can
* additionally perform a "broadcast" sum too when `bias_term: true`
* is set.
*
* The latter, scale input may be omitted, in which case it's learned as a
* parameter of the layer (as is the bias, if it is included).
*/
template <typename Dtype>
class ScaleLayer: public Layer<Dtype> {
public:
explicit ScaleLayer(const LayerParameter& param)
: Layer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual inline const char* type() const { return "Scale"; }
virtual inline int MinBottomBlobs() const { return 1; }
virtual inline int MaxBottomBlobs() const { return 2; }
virtual inline int ExactNumTopBlobs() const { return 1; }
protected:
/**
* In the below shape specifications, @f$ i @f$ denotes the value of the
* `axis` field given by `this->layer_param_.scale_param().axis()`, after
* canonicalization (i.e., conversion from negative to positive index,
* if applicable).
* @param bottom input Blob vector (length 2)
* -# @f$ (d_0 \times ... \times
* d_i \times ... \times d_j \times ... \times d_n) @f$
* the first factor @f$ x @f$
* -# @f$ (d_i \times ... \times d_j) @f$
* the second factor @f$ y @f$
* @param top output Blob vector (length 1)
* -# @f$ (d_0 \times ... \times
* d_i \times ... \times d_j \times ... \times d_n) @f$
* the product @f$ z = x y @f$ computed after "broadcasting" y.
* Equivalent to tiling @f$ y @f$ to have the same shape as @f$ x @f$,
* then computing the elementwise product.
*/
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
shared_ptr<Layer<Dtype> > bias_layer_;
vector<Blob<Dtype>*> bias_bottom_vec_;
vector<bool> bias_propagate_down_;
int bias_param_id_;
Blob<Dtype> sum_multiplier_;
Blob<Dtype> sum_result_;
Blob<Dtype> temp_;
int axis_;
int outer_dim_, scale_dim_, inner_dim_;
};

}  // namespace caffe
#endif // CAFFE_SCALE_LAYER_HPP_
7. scale_layer.cpp
#include <algorithm>
#include <vector>
#include "caffe/filler.hpp"
#include "caffe/layer_factory.hpp"
#include "caffe/layers/scale_layer.hpp"
#include "caffe/util/math_functions.hpp"
namespace caffe {
template <typename Dtype>
void ScaleLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const ScaleParameter& param = this->layer_param_.scale_param();
  // Check whether the scale parameter blob has already been initialized.
  if (bottom.size() == 1 && this->blobs_.size() > 0) {
    LOG(INFO) << "Skipping parameter initialization";
  } else if (bottom.size() == 1) {
    // scale is a learned parameter; initialize it
    axis_ = bottom[0]->CanonicalAxisIndex(param.axis());
    const int num_axes = param.num_axes();
    CHECK_GE(num_axes, -1) << "num_axes must be non-negative, "
                           << "or -1 to extend to the end of bottom[0]";
    if (num_axes >= 0) {
      CHECK_GE(bottom[0]->num_axes(), axis_ + num_axes)
          << "scale blob's shape extends past bottom[0]'s shape when applied "
          << "starting with bottom[0] axis = " << axis_;
    }
    this->blobs_.resize(1);  // gamma
    const vector<int>::const_iterator& shape_start =
        bottom[0]->shape().begin() + axis_;
    const vector<int>::const_iterator& shape_end =
        (num_axes == -1) ? bottom[0]->shape().end() : (shape_start + num_axes);
    vector<int> scale_shape(shape_start, shape_end);
    this->blobs_[0].reset(new Blob<Dtype>(scale_shape));
    FillerParameter filler_param(param.filler());
    if (!param.has_filler()) {
      // Default to unit (1) filler for identity operation.
      filler_param.set_type("constant");
      filler_param.set_value(1);
    }
    shared_ptr<Filler<Dtype> > filler(GetFiller<Dtype>(filler_param));
    filler->Fill(this->blobs_[0].get());
  }
  if (param.bias_term()) {
    // Set up the internal Bias layer that handles the bias term.
    LayerParameter layer_param(this->layer_param_);
    layer_param.set_type("Bias");
    BiasParameter* bias_param = layer_param.mutable_bias_param();
    bias_param->set_axis(param.axis());
    if (bottom.size() > 1) {
      bias_param->set_num_axes(bottom[1]->num_axes());
    } else {
      bias_param->set_num_axes(param.num_axes());
    }
    bias_param->mutable_filler()->CopyFrom(param.bias_filler());
    bias_layer_ = LayerRegistry<Dtype>::CreateLayer(layer_param);
    bias_bottom_vec_.resize(1);
    bias_bottom_vec_[0] = bottom[0];
    bias_layer_->SetUp(bias_bottom_vec_, top);
    if (this->blobs_.size() + bottom.size() < 3) {
      // case: blobs.size == 1 && bottom.size == 1
      // or blobs.size == 0 && bottom.size == 2
      bias_param_id_ = this->blobs_.size();
      this->blobs_.resize(bias_param_id_ + 1);
      this->blobs_[bias_param_id_] = bias_layer_->blobs()[0];
    } else {
      // bias param already initialized
      bias_param_id_ = this->blobs_.size() - 1;
      bias_layer_->blobs()[0] = this->blobs_[bias_param_id_];
    }
    bias_propagate_down_.resize(1, false);
  }
  this->param_propagate_down_.resize(this->blobs_.size(), true);
}
template <typename Dtype>
void ScaleLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const ScaleParameter& param = this->layer_param_.scale_param();
  Blob<Dtype>* scale = (bottom.size() > 1) ? bottom[1] : this->blobs_[0].get();
  // Always set axis_ == 0 in special case where scale is a scalar
  // (num_axes == 0). Mathematically equivalent for any choice of axis_, so the
  // actual setting can be safely ignored; and computation is most efficient
  // with axis_ == 0 and (therefore) outer_dim_ == 1. (Setting axis_ to
  // bottom[0]->num_axes() - 1, giving inner_dim_ == 1, would be equally
  // performant.)
  axis_ = (scale->num_axes() == 0) ?
      0 : bottom[0]->CanonicalAxisIndex(param.axis());
  CHECK_GE(bottom[0]->num_axes(), axis_ + scale->num_axes())
      << "scale blob's shape extends past bottom[0]'s shape when applied "
      << "starting with bottom[0] axis = " << axis_;
  for (int i = 0; i < scale->num_axes(); ++i) {
    CHECK_EQ(bottom[0]->shape(axis_ + i), scale->shape(i))
        << "dimension mismatch between bottom[0]->shape(" << axis_ + i
        << ") and scale->shape(" << i << ")";
  }
  outer_dim_ = bottom[0]->count(0, axis_);
  scale_dim_ = scale->count();
  inner_dim_ = bottom[0]->count(axis_ + scale->num_axes());
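  // Worked example (matching the shapes in the ScaleParameter comments above):
  // with bottom[0] of shape 100x3x40x60, axis == 1, and a 3x40 scale blob,
  // outer_dim_ == 100, scale_dim_ == 3*40 == 120, and inner_dim_ == 60, so
  // each of the 120 scale values multiplies one contiguous run of 60 elements.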
  // If top and bottom are the same blob (same name in the prototxt), compute
  // in-place; temp_ then holds a copy of the input for Backward. Otherwise
  // just reshape top like bottom.
  if (bottom[0] == top[0]) {  // in-place computation
    temp_.ReshapeLike(*bottom[0]);
  } else {
    top[0]->ReshapeLike(*bottom[0]);
  }
  sum_result_.Reshape(vector<int>(1, outer_dim_ * scale_dim_));
  const int sum_mult_size = std::max(outer_dim_, inner_dim_);
  sum_multiplier_.Reshape(vector<int>(1, sum_mult_size));
  if (sum_multiplier_.cpu_data()[sum_mult_size - 1] != Dtype(1)) {
    caffe_set(sum_mult_size, Dtype(1), sum_multiplier_.mutable_cpu_data());
  }
  if (bias_layer_) {
    bias_bottom_vec_[0] = top[0];
    bias_layer_->Reshape(bias_bottom_vec_, top);
  }
}
template <typename Dtype>
void ScaleLayer<Dtype>::Forward_cpu(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();
  if (bottom[0] == top[0]) {
    // In-place computation; need to store bottom data before overwriting it.
    // Note that this is only necessary for Backward; we could skip this if not
    // doing Backward, but Caffe currently provides no way of knowing whether
    // we'll need to do Backward at the time of the Forward call.
    caffe_copy(bottom[0]->count(), bottom[0]->cpu_data(),
               temp_.mutable_cpu_data());
  }
  const Dtype* scale_data =
      ((bottom.size() > 1) ? bottom[1] : this->blobs_[0].get())->cpu_data();
  Dtype* top_data = top[0]->mutable_cpu_data();
  // Broadcast multiply: each scale value scales a contiguous inner_dim_ run.
  for (int n = 0; n < outer_dim_; ++n) {
    for (int d = 0; d < scale_dim_; ++d) {
      const Dtype factor = scale_data[d];
      caffe_cpu_scale(inner_dim_, factor, bottom_data, top_data);
      bottom_data += inner_dim_;
      top_data += inner_dim_;
    }
  }
  if (bias_layer_) {
    bias_layer_->Forward(bias_bottom_vec_, top);
  }
}
template <typename Dtype>
void ScaleLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (bias_layer_ &&
      this->param_propagate_down_[this->param_propagate_down_.size() - 1]) {
    bias_layer_->Backward(top, bias_propagate_down_, bias_bottom_vec_);
  }
  const bool scale_param = (bottom.size() == 1);
  Blob<Dtype>* scale = scale_param ? this->blobs_[0].get() : bottom[1];
  if ((!scale_param && propagate_down[1]) ||
      (scale_param && this->param_propagate_down_[0])) {
    const Dtype* top_diff = top[0]->cpu_diff();
    const bool in_place = (bottom[0] == top[0]);
    const Dtype* bottom_data = (in_place ? &temp_ : bottom[0])->cpu_data();
    // Hack: store big eltwise product in bottom[0] diff, except in the special
    // case where this layer itself does the eltwise product, in which case we
    // can store it directly in the scale diff, and we're done.
    // If we're computing in-place (and not doing eltwise computation), this
    // hack doesn't work and we store the product in temp_.
    const bool is_eltwise = (bottom[0]->count() == scale->count());
    Dtype* product = (is_eltwise ? scale->mutable_cpu_diff() :
        (in_place ? temp_.mutable_cpu_data() : bottom[0]->mutable_cpu_diff()));
    caffe_mul(top[0]->count(), top_diff, bottom_data, product);
    if (!is_eltwise) {
      // Reduce the product over inner_dim_ and outer_dim_ to obtain the
      // scale gradient.
      Dtype* sum_result = NULL;
      if (inner_dim_ == 1) {
        sum_result = product;
      } else if (sum_result_.count() == 1) {
        const Dtype* sum_mult = sum_multiplier_.cpu_data();
        Dtype* scale_diff = scale->mutable_cpu_diff();
        if (scale_param) {
          Dtype result = caffe_cpu_dot(inner_dim_, product, sum_mult);
          *scale_diff += result;
        } else {
          *scale_diff = caffe_cpu_dot(inner_dim_, product, sum_mult);
        }
      } else {
        const Dtype* sum_mult = sum_multiplier_.cpu_data();
        sum_result = (outer_dim_ == 1) ?
            scale->mutable_cpu_diff() : sum_result_.mutable_cpu_data();
        caffe_cpu_gemv(CblasNoTrans, sum_result_.count(), inner_dim_,
                       Dtype(1), product, sum_mult, Dtype(0), sum_result);
      }
      if (outer_dim_ != 1) {
        const Dtype* sum_mult = sum_multiplier_.cpu_data();
        Dtype* scale_diff = scale->mutable_cpu_diff();
        if (scale_dim_ == 1) {
          if (scale_param) {
            Dtype result = caffe_cpu_dot(outer_dim_, sum_mult, sum_result);
            *scale_diff += result;
          } else {
            *scale_diff = caffe_cpu_dot(outer_dim_, sum_mult, sum_result);
          }
        } else {
          caffe_cpu_gemv(CblasTrans, outer_dim_, scale_dim_,
                         Dtype(1), sum_result, sum_mult, Dtype(scale_param),
                         scale_diff);
        }
      }
    }
  }
  if (propagate_down[0]) {
    const Dtype* top_diff = top[0]->cpu_diff();
    const Dtype* scale_data = scale->cpu_data();
    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
    // Gradient w.r.t. bottom: the same broadcast multiply as Forward_cpu,
    // applied to top_diff.
    for (int n = 0; n < outer_dim_; ++n) {
      for (int d = 0; d < scale_dim_; ++d) {
        const Dtype factor = scale_data[d];
        caffe_cpu_scale(inner_dim_, factor, top_diff, bottom_diff);
        bottom_diff += inner_dim_;
        top_diff += inner_dim_;
      }
    }
  }
}
#ifdef CPU_ONLY
STUB_GPU(ScaleLayer);
#endif

INSTANTIATE_CLASS(ScaleLayer);
REGISTER_LAYER_CLASS(Scale);

}  // namespace caffe
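To summarize the backward pass: since top = bottom * scale (with broadcasting), the scale gradient is the sum of top_diff * bottom_data over the broadcast dimensions, reduced with the all-ones sum_multiplier_ via dot products and gemv calls; when the scale is a learned parameter, the result is accumulated into the existing diff (the += and the Dtype(scale_param) beta argument to gemv), matching Caffe's convention of accumulating parameter gradients. The bottom gradient is simply top_diff multiplied by the scale, i.e. the same broadcast loop as Forward_cpu.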