| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| /*! |
| * \file batch_norm-inl.h |
| * \brief |
| * \author Bing Xu, Chris Olivier, Da Zheng |
| */ |
| #ifndef MXNET_OPERATOR_NN_BATCH_NORM_INL_H_ |
| #define MXNET_OPERATOR_NN_BATCH_NORM_INL_H_ |
| |
| #include <dmlc/logging.h> |
| #include <dmlc/parameter.h> |
| #include <mxnet/operator.h> |
| |
| #include <mshadow/base.h> |
| |
| #include <map> |
| #include <string> |
| #include <utility> |
| #include <vector> |
| |
| #include "../mshadow_op.h" |
| #include "../mxnet_op.h" |
| #include "../operator_common.h" |
| |
| #ifdef __GNUG__ |
| #pragma GCC diagnostic push |
| #pragma GCC diagnostic ignored "-Wunused-local-typedefs" |
| #endif |
| |
| /*! \brief inverse standard deviation <-> variance */ |
| #define VARIANCE_TO_INVSTD(__var$, __eps$) (1.0 / std::sqrt((__var$) + (__eps$))) |
| #define INVSTD_TO_VARIANCE(__invstd$, __eps$) ((1.0 / ((__invstd$) * (__invstd$))) - (__eps$)) |
| |
| namespace mxnet { |
| namespace op { |
| |
| namespace batchnorm { |
| enum BatchNormOpInputs { |
| kData, |
| kGamma, |
| kBeta, |
| kInMovingMean, |
| kInMovingVar |
| }; // kGamma: weights, kBeta: biases |
| enum BatchNormOpOutputs { kOut, kMean, kVar }; // req, out_data |
| enum BatchNormOpResource { kTempSpace }; |
| enum BatchNormOpAuxiliary { kMovingMean, kMovingVar }; // aux_states |
| |
| /*! \brief Default channel axis if none specified in the params */ |
| constexpr int DEFAULT_AXIS = 1; |
| } // namespace batchnorm |
| |
| /*! \brief Parameters for BatchNorm operator */ |
| namespace quantized_batchnorm { |
| enum QuantizedBatchNormOpInputs { |
| kData, |
| kGamma, |
| kBeta, |
| kInMovingMean, |
| kInMovingVar, |
| kDataMin, |
| kDataMax |
| }; |
| enum QuantizedBatchNormOutputs { kOut, kOutMin, kOutMax }; |
| enum QuantizedBatchNormOpAuxiliary { kMovingMean, kMovingVar }; |
| } // namespace quantized_batchnorm |
| |
| /*! \brief Parameters for BatchNoram operator */ |
| struct BatchNormParam : public dmlc::Parameter<BatchNormParam> { |
| double eps; |
| float momentum; |
| bool fix_gamma; |
| bool use_global_stats; |
| bool output_mean_var; |
| int axis; |
| bool cudnn_off; |
| |
| dmlc::optional<float> min_calib_range; // min float value calculated from calibration dataset |
| dmlc::optional<float> max_calib_range; // max float value calculated from calibration dataset |
| |
| DMLC_DECLARE_PARAMETER(BatchNormParam) { |
| DMLC_DECLARE_FIELD(eps).set_default(1e-3f).describe( |
| "Epsilon to prevent div 0. " |
| "Must be no less than CUDNN_BN_MIN_EPSILON " |
| "defined in cudnn.h when using cudnn (usually 1e-5)"); |
| DMLC_DECLARE_FIELD(momentum).set_default(0.9f).describe("Momentum for moving average"); |
| DMLC_DECLARE_FIELD(fix_gamma).set_default(true).describe("Fix gamma while training"); |
| DMLC_DECLARE_FIELD(use_global_stats) |
| .set_default(false) |
| .describe( |
| "Whether use global moving statistics instead of local batch-norm. " |
| "This will force change batch-norm into a scale shift operator."); |
| DMLC_DECLARE_FIELD(output_mean_var) |
| .set_default(false) |
| .describe("Output the mean and inverse std "); |
| DMLC_DECLARE_FIELD(axis) |
| .set_default(mxnet::op::batchnorm::DEFAULT_AXIS) |
| .describe("Specify which shape axis the channel is specified"); |
| DMLC_DECLARE_FIELD(cudnn_off).set_default(false).describe( |
| "Do not select CUDNN operator, if available"); |
| DMLC_DECLARE_FIELD(min_calib_range) |
| .set_default(dmlc::optional<float>()) |
| .describe( |
| "The minimum scalar value in the form of float32 obtained " |
| "through calibration. If present, it will be used to by " |
| "quantized batch norm op to calculate primitive scale." |
| "Note: this calib_range is to calib bn output."); |
| DMLC_DECLARE_FIELD(max_calib_range) |
| .set_default(dmlc::optional<float>()) |
| .describe( |
| "The maximum scalar value in the form of float32 obtained " |
| "through calibration. If present, it will be used to by " |
| "quantized batch norm op to calculate primitive scale." |
| "Note: this calib_range is to calib bn output."); |
| } |
| |
| bool operator==(const BatchNormParam& other) const { |
| bool flag = this->eps == other.eps && this->momentum == other.momentum && |
| this->fix_gamma == other.fix_gamma && |
| this->use_global_stats == other.use_global_stats && |
| this->output_mean_var == other.output_mean_var && this->axis == other.axis && |
| this->cudnn_off == other.cudnn_off && |
| this->min_calib_range.has_value() == other.min_calib_range.has_value() && |
| this->max_calib_range.has_value() == other.max_calib_range.has_value(); |
| if (this->min_calib_range.has_value() && other.min_calib_range.has_value() && |
| this->max_calib_range.has_value() && other.max_calib_range.has_value()) { |
| flag = flag && this->min_calib_range.value() == other.min_calib_range.value() && |
| this->max_calib_range.value() == other.max_calib_range.value(); |
| } |
| return flag; |
| } |
| void SetAttrDict(std::unordered_map<std::string, std::string>* dict) { |
| std::ostringstream eps_s, momentum_s, fix_gamma_s, use_global_stats_s, output_mean_var_s, |
| axis_s, cudnn_off_s, min_calib_range_s, max_calib_range_s; |
| eps_s << eps; |
| momentum_s << momentum; |
| fix_gamma_s << fix_gamma; |
| use_global_stats_s << use_global_stats; |
| output_mean_var_s << output_mean_var; |
| axis_s << axis; |
| cudnn_off_s << cudnn_off; |
| min_calib_range_s << min_calib_range; |
| max_calib_range_s << max_calib_range; |
| (*dict)["eps"] = eps_s.str(); |
| (*dict)["momentum"] = momentum_s.str(); |
| (*dict)["fix_gamma"] = fix_gamma_s.str(); |
| (*dict)["use_global_stats"] = use_global_stats_s.str(); |
| (*dict)["output_mean_var"] = output_mean_var_s.str(); |
| (*dict)["axis"] = axis_s.str(); |
| (*dict)["cudnn_off"] = cudnn_off_s.str(); |
| (*dict)["min_calib_range"] = min_calib_range_s.str(); |
| (*dict)["max_calib_range"] = max_calib_range_s.str(); |
| } |
| }; |
| |
| } // namespace op |
| } // namespace mxnet |
| |
| namespace std { |
| template <> |
| struct hash<mxnet::op::BatchNormParam> { |
| size_t operator()(const mxnet::op::BatchNormParam& val) { |
| size_t ret = 0; |
| ret = dmlc::HashCombine(ret, val.momentum); |
| ret = dmlc::HashCombine(ret, val.fix_gamma); |
| ret = dmlc::HashCombine(ret, val.use_global_stats); |
| ret = dmlc::HashCombine(ret, val.output_mean_var); |
| ret = dmlc::HashCombine(ret, val.axis); |
| return ret; |
| } |
| }; |
| } // namespace std |
| |
| namespace mxnet { |
| namespace op { |
| |
| static inline bool IsBNWriting(const OpReqType ort) { |
| return ort == kWriteTo || ort == kWriteInplace; |
| } |
| |
| template <typename xpu, typename DType, typename AccReal> |
| void BatchNormForwardImpl(mshadow::Stream<cpu>* stream, |
| const OpContext& ctx, |
| const BatchNormParam& param, |
| const std::vector<TBlob>& in_data, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& out_data, |
| const std::vector<TBlob>& aux_states); |
| |
| template <typename xpu, typename DType, typename AccReal> |
| void BatchNormBackwardImpl(mshadow::Stream<cpu>* stream, |
| const OpContext& ctx, |
| const BatchNormParam& param, |
| const std::vector<TBlob>& out_grad, |
| const std::vector<TBlob>& in_data, |
| const std::vector<TBlob>& out_data, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& in_grad, |
| const std::vector<TBlob>& aux_states); |
| |
| #if MXNET_USE_CUDA |
| template <typename xpu, typename DType, typename AccReal> |
| void BatchNormForwardImpl(mshadow::Stream<gpu>* stream, |
| const OpContext& ctx, |
| const BatchNormParam& param, |
| const std::vector<TBlob>& in_data, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& out_data, |
| const std::vector<TBlob>& aux_states); |
| template <typename xpu, typename DType, typename AccReal> |
| void BatchNormBackwardImpl(mshadow::Stream<gpu>* stream, |
| const OpContext& ctx, |
| const BatchNormParam& param, |
| const std::vector<TBlob>& out_grad, |
| const std::vector<TBlob>& in_data, |
| const std::vector<TBlob>& out_data, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& in_grad, |
| const std::vector<TBlob>& aux_states); |
| #endif // MXNET_USE_CUDA |
| |
| /*! |
| * \brief perform a forward operation of Operator, save the output to TBlob. |
| * \param ctx runtime context available to this call |
| * \param in_data array of input data, it is const |
| * \param req the request types of saving operation, can only be kWriteTo or kWriteInplace. |
| * \param out_data array of output data, pointer is used to indicate that this is holder |
| * the space of TBlob in out_data must be pre-allocated with InferShape |
| * \param aux_states Auxiliary states of operator. Normally operator doesn't |
| * need, special case like Batch Norm requires. |
| * \sa OpReqType, OpContext |
| */ |
| template <typename xpu, typename DType, typename AccReal> |
| void BatchNormForward(const OpContext& ctx, |
| const BatchNormParam& param, |
| const std::vector<TBlob>& in_data, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& out_data, |
| const std::vector<TBlob>& aux_states) { |
| using namespace mshadow; |
| using namespace mshadow::expr; |
| |
| CHECK_EQ(in_data.size(), 3U); |
| CHECK_EQ(aux_states.size(), 2U); |
| if (ctx.is_train) { |
| CHECK_EQ(out_data.size(), 3U); |
| CHECK_EQ(req.size(), 3U); |
| } else { |
| CHECK_GE(out_data.size(), 1U); |
| CHECK_GE(req.size(), 1U); |
| CHECK_EQ(req[batchnorm::kOut], kWriteTo); |
| } |
| Stream<xpu>* s = ctx.get_stream<xpu>(); |
| BatchNormForwardImpl<xpu, DType, AccReal>(s, ctx, param, in_data, req, out_data, aux_states); |
| } |
| |
| /*! |
| * \brief Perform a Backward Operation, write gradient to the in_grad. |
| * |
| * \note |
| * Convention: |
| * out_grad.size() == OperatorProperty.NumVisibleOutputs() |
| * out_data.size() == OperatorProperty.NumOutputs() |
| * out_data can contain additional invisible returns that remembers the |
| * state carried from the Forward pass. For example mask in the dropout. |
| * The gradients are passed from visible returns in this function. |
| * |
| * \par |
| * Not all the TBlobs in the arguments will be available |
| * if you override the DeclareBackwardDependency of corresponding OperatorProperty class. |
| * Only the dependencies you declared will be available at corresponding position, |
| * the rest of the parameters are simply dummy where you will get a nullptr. |
| * You will be safe if you use the default DeclareBackwardDependency. |
| * But only declare what you need will give engine more chance for optimization. |
| * |
| * \param ctx runtime context available to this call |
| * \param out_grad the gradient value we get from of the Operator. |
| * \param in_data the array of input data. |
| * \param out_data the array of output data. |
| * \param req request types of the saving operation, can be all types. |
| * \param in_grad the array of gradient we need to write to. |
| * \param aux_states Auxiliary states of operator. Normally operator doesn't need |
| * \sa OperatorProperty, OpReqType, OpContext |
| */ |
| template <typename xpu, typename DType, typename AccReal> |
| void BatchNormBackward(const OpContext& ctx, |
| const BatchNormParam& param, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| CHECK_EQ(inputs.size(), 8U); |
| CHECK_EQ(outputs.size(), 3U); |
| |
| std::vector<TBlob> out_grad(1); |
| std::vector<TBlob> out_data(3); |
| std::vector<TBlob> in_data(3); |
| std::vector<TBlob> aux_states(2); |
| |
| out_grad[0] = inputs[0]; |
| out_data[batchnorm::kMean] = inputs[1]; |
| out_data[batchnorm::kVar] = inputs[2]; |
| in_data[batchnorm::kData] = inputs[3]; |
| in_data[batchnorm::kGamma] = inputs[4]; |
| in_data[batchnorm::kBeta] = inputs[5]; |
| aux_states[batchnorm::kMovingMean] = inputs[6]; |
| aux_states[batchnorm::kMovingVar] = inputs[7]; |
| const std::vector<TBlob>& in_grad = outputs; |
| mshadow::Stream<xpu>* s = ctx.get_stream<xpu>(); |
| BatchNormBackwardImpl<xpu, DType, AccReal>( |
| s, ctx, param, out_grad, in_data, out_data, req, in_grad, aux_states); |
| } |
| |
| template <typename xpu> |
| void BatchNormCompute(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed); |
| CHECK_EQ(inputs.size(), 5U); |
| std::vector<TBlob> in_data(inputs.begin(), inputs.begin() + batchnorm::kInMovingMean); |
| std::vector<TBlob> aux_states(inputs.begin() + batchnorm::kInMovingMean, inputs.end()); |
| MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, { |
| BatchNormForward<xpu, DType, AccReal>(ctx, param, in_data, req, outputs, aux_states); |
| }); |
| } |
| |
| template <typename xpu> |
| void BatchNormGradCompute(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| CHECK_EQ(inputs.size(), 8U); |
| const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed); |
| |
| MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, { |
| BatchNormBackward<xpu, DType, AccReal>(ctx, param, inputs, req, outputs); |
| }); |
| } |
| |
| #if DMLC_USE_CXX11 |
| |
| namespace batchnorm { |
| |
| template <typename DType> |
| class BNTensor3 { |
| enum { OUTER, CHANNEL, INNER, COUNT }; |
| |
| public: |
| inline BNTensor3(const TBlob& blob, const int indexOfChannel) |
| : dptr_(blob.dptr<DType>()), |
| indexOfChannel_(static_cast<size_t>( |
| indexOfChannel < 0 ? (static_cast<int>(blob.shape_.ndim()) + indexOfChannel) : |
| indexOfChannel)) { |
| CHECK_EQ(blob.type_flag_, mshadow::DataType<DType>::kFlag); |
| shape_[OUTER] = 1; |
| for (size_t i = 0; i < indexOfChannel_; ++i) { |
| shape_[OUTER] *= blob.shape_[i]; |
| } |
| shape_[CHANNEL] = blob.shape_[indexOfChannel_]; |
| shape_[INNER] = 1; |
| for (size_t i = indexOfChannel_ + 1, n = blob.shape_.ndim(); i < n; ++i) { |
| shape_[INNER] *= blob.shape_[i]; |
| } |
| } |
| |
| inline BNTensor3(DType* p, const mxnet::TShape& shape, const int indexOfChannel) |
| : dptr_(p), |
| indexOfChannel_(static_cast<size_t>(indexOfChannel < 0 ? |
| (static_cast<int>(shape.ndim()) + indexOfChannel) : |
| indexOfChannel)) { |
| shape_[OUTER] = 1; |
| for (size_t i = 0; i < indexOfChannel_; ++i) { |
| shape_[OUTER] *= shape[i]; |
| } |
| shape_[CHANNEL] = shape[indexOfChannel_]; |
| shape_[INNER] = 1; |
| for (size_t i = indexOfChannel_ + 1, n = shape.ndim(); i < n; ++i) { |
| shape_[INNER] *= shape[i]; |
| } |
| } |
| |
| MSHADOW_FORCE_INLINE bool IsEmpty() const { |
| return dptr_ == nullptr; |
| } |
| |
| MSHADOW_XINLINE size_t Size() const { |
| size_t n = 1; |
| for (int i = 0; i < COUNT; ++i) { |
| n *= shape_[i]; |
| } |
| return n; |
| } |
| |
| MSHADOW_XINLINE size_t ChannelCount() const { |
| return shape_[CHANNEL]; |
| } |
| |
| MSHADOW_XINLINE size_t OuterSize() const { |
| return shape_[OUTER]; |
| } |
| |
| MSHADOW_XINLINE size_t InnerSize() const { |
| return shape_[INNER]; |
| } |
| |
| /*! \brief start of a given channel's spatial data */ |
| MSHADOW_XINLINE size_t StartOffset(const size_t channel) const { |
| return channel * InnerSize(); |
| } |
| |
| /*! \brief This is the amount to skip to next same-channel data |
| * This is the number of bytes to skip from one past the end of the current spatial data |
| * to the next start of the same channel's "spatial data" |
| * It is assume that the pointer being calculated points just beyond the |
| * end of the last blobk of spatial data |
| * i.e. RGBRGB <-- 2 |
| * RRGGBB <-- 4 |
| **/ |
| MSHADOW_XINLINE size_t SkipLengthToNextSameChannelData() const { |
| return (ChannelCount() - 1) * InnerSize(); |
| } |
| |
| MSHADOW_XINLINE size_t offset(const size_t outer, const size_t channel, const size_t i) const { |
| const size_t spatial_size = InnerSize(); |
| const size_t skip_length = SkipLengthToNextSameChannelData(); |
| size_t off = StartOffset(channel); |
| off += outer * shape_[CHANNEL] * shape_[INNER]; |
| const size_t skips = i / spatial_size; |
| off += (1 + skip_length) * skips; |
| off += i % spatial_size; |
| return off; |
| } |
| |
| MSHADOW_XINLINE DType& get_ref(const size_t batch, const size_t channel, const size_t i) { |
| const size_t off = offset(batch, channel, i); |
| return dptr_[off]; |
| } |
| |
| MSHADOW_XINLINE const DType& get_ref(const size_t batch, |
| const size_t channel, |
| const size_t i) const { |
| const size_t off = offset(batch, channel, i); |
| return dptr_[off]; |
| } |
| |
| DType* dptr_; |
| size_t indexOfChannel_; |
| size_t shape_[COUNT]; |
| }; |
| |
| inline int GetRealAxis(const mxnet::TShape& shape, int axis) { |
| if (axis < 0) { |
| axis += shape.ndim(); |
| } |
| return axis; |
| } |
| |
| extern volatile bool disable_mkl; |
| |
| } // namespace batchnorm |
| |
| #endif // DMLC_USE_CXX11 |
| } // namespace op |
| } // namespace mxnet |
| |
| #ifdef __GNUG__ |
| #pragma GCC diagnostic pop |
| #endif |
| |
| #endif // MXNET_OPERATOR_NN_BATCH_NORM_INL_H_ |