/*!
* Copyright (c) 2017 by Contributors
* \file batch_norm-inl.h
* \brief batch normalization operator
* \author Bing Xu, Chris Olivier
*/
#ifndef MXNET_OPERATOR_BATCH_NORM_INL_H_
#define MXNET_OPERATOR_BATCH_NORM_INL_H_
#include <dmlc/logging.h>
#include <dmlc/parameter.h>
#include <mxnet/operator.h>
#include <mshadow/base.h>
#include <map>
#include <vector>
#include <string>
#include <utility>
#include "./mshadow_op.h"
#include "./operator_common.h"
#include "mxnet_op.h"
#ifdef __GNUG__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
#endif
namespace mxnet {
namespace op {
namespace batchnorm {
enum BatchNormOpInputs {kData, kGamma, kBeta}; // kGamma: weights, kBeta: biases
enum BatchNormOpOutputs {kOut, kMean, kVar}; // req, out_data
enum BatchNormOpAuxiliary {kMovingMean, kMovingVar}; // aux_states
/*! \brief Default channel axis if none is specified in the params */
constexpr int DEFAULT_AXIS = 1;
} // namespace batchnorm
/*! \brief Parameters for the BatchNorm operator */
struct BatchNormParam : public dmlc::Parameter<BatchNormParam> {
float eps;
float momentum;
bool fix_gamma;
bool use_global_stats;
bool output_mean_var;
int axis;
bool cudnn_off;
DMLC_DECLARE_PARAMETER(BatchNormParam) {
DMLC_DECLARE_FIELD(eps).set_default(1e-3f)
.describe("Epsilon to prevent div 0. "
"Must be bigger than CUDNN_BN_MIN_EPSILON "
"defined in cudnn.h when using cudnn (usually 1e-5)");
DMLC_DECLARE_FIELD(momentum).set_default(0.9f)
.describe("Momentum for moving average");
DMLC_DECLARE_FIELD(fix_gamma).set_default(true)
.describe("Fix gamma while training");
DMLC_DECLARE_FIELD(use_global_stats).set_default(false)
.describe("Whether to use global moving statistics instead of local batch statistics. "
"This will turn batch-norm into a scale-shift operator.");
DMLC_DECLARE_FIELD(output_mean_var).set_default(false)
.describe("Output the mean and variance in addition to the normalized output");
DMLC_DECLARE_FIELD(axis).set_default(mxnet::op::batchnorm::DEFAULT_AXIS)
.describe("Specify which axis of the input shape is the channel axis");
DMLC_DECLARE_FIELD(cudnn_off).set_default(false)
.describe("Do not use the cuDNN implementation of this operator, even if it is available");
}
};
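/* A minimal usage sketch (illustrative, not part of this header), assuming the
* standard dmlc::Parameter workflow also used by BatchNormProp::Init below:
* the struct is populated from string kwargs, and unspecified fields keep the
* defaults declared above.
*
*   std::vector<std::pair<std::string, std::string>> kwargs = {
*       {"eps", "2e-5"}, {"momentum", "0.9"}, {"axis", "1"}};
*   BatchNormParam param;
*   param.Init(kwargs);  // fix_gamma, use_global_stats, cudnn_off, ... keep their defaults
*/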
/*! \brief Batch normalization operator */
template <typename xpu, typename DType, typename AccReal>
class BatchNormOp : public Operator {
public:
explicit BatchNormOp(BatchNormParam param) {
this->param_ = param;
}
static inline bool IsWriting(const OpReqType ort) {
return ort == kWriteTo || ort == kWriteInplace;
}
/*!
* \brief perform a forward operation of Operator, save the output to TBlob.
* \param ctx runtime context available to this call
* \param in_data array of input data, it is const
* \param req the request types of saving operation, can only be kWriteTo or kWriteInplace.
* \param out_data array of output data; the space of each TBlob in out_data
* must be pre-allocated according to InferShape
* \param aux_states Auxiliary states of the operator. Normally an operator doesn't
* need them; special cases like Batch Norm do.
* \sa OpReqType, OpContext
*/
virtual void Forward(const OpContext &ctx,
const std::vector<TBlob> &in_data,
const std::vector<OpReqType> &req,
const std::vector<TBlob> &out_data,
const std::vector<TBlob> &aux_states) {
using namespace mshadow;
using namespace mshadow::expr;
CHECK_EQ(in_data.size(), 3U);
CHECK_EQ(aux_states.size(), 2U);
if (ctx.is_train) {
CHECK_EQ(out_data.size(), 3U);
CHECK_EQ(req.size(), 3U);
} else {
CHECK_GE(out_data.size(), 1U);
CHECK_GE(req.size(), 1U);
CHECK_EQ(req[batchnorm::kOut], kWriteTo);
}
Stream<xpu> *s = ctx.get_stream<xpu>();
DoForward(s, ctx, in_data, req, out_data, aux_states);
}
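/* For illustration (this just mirrors the batchnorm enums at the top of the file):
* callers arrange the blobs as
*   in_data    = {data, gamma, beta}          // kData, kGamma, kBeta
*   out_data   = {out, mean, var}             // kOut, kMean, kVar
*   aux_states = {moving_mean, moving_var}    // kMovingMean, kMovingVar
* which is what the size checks in Forward() above rely on.
*/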
/*!
* \brief Perform a Backward Operation, write gradient to the in_grad.
*
* \note
* Convention:
* out_grad.size() == OperatorProperty.NumVisibleOutputs()
* out_data.size() == OperatorProperty.NumOutputs()
* out_data can contain additional invisible returns that remember the
* state carried over from the Forward pass, for example the mask in dropout.
* The gradients are passed in only for the visible returns.
*
* \par
* Not all the TBlobs in the arguments will be available
* if you override DeclareBackwardDependency of the corresponding OperatorProperty class.
* Only the dependencies you declared will be available at their corresponding positions;
* the rest of the parameters are simply dummies for which you will get a nullptr.
* You will be safe if you use the default DeclareBackwardDependency,
* but declaring only what you need gives the engine more opportunities for optimization.
*
* \param ctx runtime context available to this call
* \param out_grad the gradient value propagated back from the outputs of the Operator.
* \param in_data the array of input data.
* \param out_data the array of output data.
* \param req request types of the saving operation, can be all types.
* \param in_grad the array of gradient we need to write to.
* \param aux_states Auxiliary states of the operator. Normally the operator doesn't need these.
* \sa OperatorProperty, OpReqType, OpContext
*/
virtual void Backward(const OpContext &ctx,
const std::vector<TBlob> &out_grad,
const std::vector<TBlob> &in_data,
const std::vector<TBlob> &out_data,
const std::vector<OpReqType> &req,
const std::vector<TBlob> &in_grad,
const std::vector<TBlob> &aux_states) {
CHECK_EQ(out_grad.size(), param_.output_mean_var ? 3U : 1U);
CHECK_EQ(in_data.size(), 3U);
CHECK_EQ(out_data.size(), 3U);
CHECK_EQ(in_grad.size(), 3U);
mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
DoBackward(s, ctx, out_grad, in_data,
out_data, req, in_grad, aux_states);
}
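/* Note (follows from NumVisibleOutputs() and DeclareBackwardDependency() below):
* unless param_.output_mean_var is set, only the gradient w.r.t. the normalized
* output is visible, so out_grad.size() == 1. The saved mean and variance are
* still reachable through out_data because they are declared as backward
* dependencies.
*/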
private:
void DoForward(mshadow::Stream<cpu> *stream,
const OpContext &ctx,
const std::vector<TBlob> &in_data,
const std::vector<OpReqType> &req,
const std::vector<TBlob> &out_data,
const std::vector<TBlob> &aux_states);
void DoBackward(mshadow::Stream<cpu> *stream,
const OpContext &ctx,
const std::vector<TBlob> &out_grad,
const std::vector<TBlob> &in_data,
const std::vector<TBlob> &out_data,
const std::vector<OpReqType> &req,
const std::vector<TBlob> &in_grad,
const std::vector<TBlob> &aux_states);
#if MXNET_USE_CUDA
void DoForward(mshadow::Stream<gpu> *stream,
const OpContext &ctx,
const std::vector<TBlob> &in_data,
const std::vector<OpReqType> &req,
const std::vector<TBlob> &out_data,
const std::vector<TBlob> &aux_states);
void DoBackward(mshadow::Stream<gpu> *stream,
const OpContext &ctx,
const std::vector<TBlob> &out_grad,
const std::vector<TBlob> &in_data,
const std::vector<TBlob> &out_data,
const std::vector<OpReqType> &req,
const std::vector<TBlob> &in_grad,
const std::vector<TBlob> &aux_states);
#endif // MXNET_USE_CUDA
/*! \brief Batch normalization operator parameters */
BatchNormParam param_;
}; // class BatchNormOp
template<typename xpu>
Operator *CreateOp(BatchNormParam param, const int dtype, const TShape& shape);
#if DMLC_USE_CXX11
class BatchNormProp : public OperatorProperty {
public:
void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
param_.Init(kwargs);
}
std::map<std::string, std::string> GetParams() const override {
return param_.__DICT__();
}
bool InferShape(std::vector<TShape> *in_shape,
std::vector<TShape> *out_shape,
std::vector<TShape> *aux_shape) const override {
using namespace mshadow;
CHECK_EQ(in_shape->size(), 3U) << "Input:[data, gamma, beta]";
const TShape &dshape = in_shape->at(0);
if (dshape.ndim() == 0) {
return false;
}
const size_t channelAxis = static_cast<size_t>(param_.axis < 0
? static_cast<int>(dshape.ndim()) + param_.axis
: param_.axis);
CHECK_LT(channelAxis, dshape.ndim()) << "Channel axis out of range: " << param_.axis;
const int channelCount = dshape[channelAxis];
in_shape->at(1) = TShape(Shape1(channelCount));
in_shape->at(2) = TShape(Shape1(channelCount));
out_shape->clear();
out_shape->push_back(dshape); // kOut
out_shape->push_back(Shape1(channelCount)); // kMean
out_shape->push_back(Shape1(channelCount)); // kVar
aux_shape->clear();
aux_shape->push_back(Shape1(channelCount)); // kMovingMean
aux_shape->push_back(Shape1(channelCount)); // kMovingVar
return true;
}
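/* Worked example (illustrative): for an NCHW input of shape (32, 3, 224, 224)
* with the default axis = 1, channelCount == 3, so InferShape() yields
*   gamma, beta              -> (3,)
*   out                      -> (32, 3, 224, 224)
*   mean, var                -> (3,)
*   moving_mean, moving_var  -> (3,)
*/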
bool InferType(std::vector<int> *in_type,
std::vector<int> *out_type,
std::vector<int> *aux_type) const override {
using namespace mshadow;
CHECK_GE(in_type->size(), 1U);
const int dtype = (*in_type)[0];
CHECK_NE(dtype, -1) << "First input must have specified type";
// For a float16 input type, beta, gamma, and the (moving) mean and variance are stored in float32.
// For other input types, these parameters have the same type as input
// NOTE: This requirement is from cuDNN (v. 4 and 5)
int dtype_param;
MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, {
dtype_param = mshadow::DataType<AccRealX>::kFlag; });
for (index_t i = 1; i < in_type->size(); ++i) {
if ((*in_type)[i] == -1) {
(*in_type)[i] = dtype_param;
} else {
CHECK_EQ((*in_type)[i], dtype_param) << "This layer requires uniform type. "
<< "Expected " << dtype_param << " vs. given "
<< (*in_type)[i] << " at " << ListArguments()[i];
}
}
for (index_t i = 0; i < aux_type->size(); ++i) {
if ((*aux_type)[i] != -1) {
CHECK_EQ((*aux_type)[i], dtype_param) << "This layer requires uniform type. "
<< "Expected " << dtype_param << " vs. given "
<< (*aux_type)[i] << " at " << ListAuxiliaryStates()[i];
}
}
const size_t n_aux = this->ListAuxiliaryStates().size();
aux_type->clear();
for (size_t i = 0; i < n_aux; ++i) {
aux_type->push_back(dtype_param);
}
const size_t n_out = this->ListOutputs().size();
out_type->clear();
out_type->push_back(dtype);
for (size_t i = 1; i < n_out; ++i) {
out_type->push_back(dtype_param);
}
return true;
}
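/* Worked example (illustrative): for float16 input data, the
* MSHADOW_REAL_TYPE_SWITCH_EX above selects float as the accumulation type, so
* gamma, beta, mean, var and both moving statistics are inferred as float32,
* while the normalized output keeps the float16 input type.
*/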
OperatorProperty* Copy() const override {
auto ptr = new BatchNormProp();
ptr->param_ = param_;
return ptr;
}
std::string TypeString() const override {
return "BatchNorm";
}
std::vector<int> DeclareBackwardDependency(
const std::vector<int> &out_grad,
const std::vector<int> &in_data,
const std::vector<int> &out_data) const override {
return {out_grad[batchnorm::kOut],
out_data[batchnorm::kMean],
out_data[batchnorm::kVar],
in_data[batchnorm::kData],
in_data[batchnorm::kGamma]
};
}
int NumVisibleOutputs() const override {
if (param_.output_mean_var) {
return 3;
}
return 1;
}
int NumOutputs() const override {
return 3;
}
std::vector<std::string> ListArguments() const override {
return {"data", "gamma", "beta"};
}
std::vector<std::string> ListOutputs() const override {
return {"output", "mean", "var"};
}
std::vector<std::string> ListAuxiliaryStates() const override {
return {"moving_mean", "moving_var"};
}
Operator* CreateOperator(Context ctx) const override {
LOG(FATAL) << "Not Implemented.";
return NULL;
}
Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
std::vector<int> *in_type) const override;
inline const BatchNormParam& getParam() const {
return param_;
}
private:
BatchNormParam param_;
}; // class BatchNormProp
namespace batchnorm {
template<typename DType>
class BNTensor3 {
enum { OUTER, CHANNEL, INNER, COUNT };
public:
inline BNTensor3(const TBlob& blob, const int indexOfChannel)
: dptr_(blob.dptr<DType>())
, indexOfChannel_(static_cast<size_t>(indexOfChannel < 0
? (static_cast<int>(blob.shape_.ndim()) + indexOfChannel)
: indexOfChannel)) {
shape_[OUTER] = 1;
for (size_t i = 0; i < indexOfChannel_; ++i) {
shape_[OUTER] *= blob.shape_[i];
}
shape_[CHANNEL] = blob.shape_[indexOfChannel_];
shape_[INNER] = 1;
for (size_t i = indexOfChannel_ + 1, n = blob.shape_.ndim(); i < n; ++i) {
shape_[INNER] *= blob.shape_[i];
}
}
inline BNTensor3(DType *p, const TShape& shape, const int indexOfChannel)
: dptr_(p)
, indexOfChannel_(static_cast<size_t>(indexOfChannel < 0
? (static_cast<int>(shape.ndim()) + indexOfChannel)
: indexOfChannel)) {
shape_[OUTER] = 1;
for (size_t i = 0; i < indexOfChannel_; ++i) {
shape_[OUTER] *= shape[i];
}
shape_[CHANNEL] = shape[indexOfChannel_];
shape_[INNER] = 1;
for (size_t i = indexOfChannel_ + 1, n = shape.ndim(); i < n; ++i) {
shape_[INNER] *= shape[i];
}
}
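/* Illustrative example: both constructors collapse the shape around the channel
* axis into (OUTER, CHANNEL, INNER). For an NCHW blob of shape (N=2, C=3, H=4, W=5)
* with indexOfChannel = 1:
*   shape_[OUTER]   = 2       // product of the axes before the channel axis
*   shape_[CHANNEL] = 3
*   shape_[INNER]   = 4 * 5   // product of the axes after the channel axis
*/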
MSHADOW_FORCE_INLINE bool IsEmpty() const {
return dptr_ == nullptr;
}
MSHADOW_XINLINE size_t Size() const {
size_t n = 1;
for (int i = 0; i < COUNT; ++i) {
n *= shape_[i];
}
return n;
}
MSHADOW_XINLINE size_t ChannelCount() const {
return shape_[CHANNEL];
}
MSHADOW_XINLINE size_t OuterSize() const {
return shape_[OUTER];
}
MSHADOW_XINLINE size_t InnerSize() const {
return shape_[INNER];
}
/*! \brief start of a given channel's spatial data */
MSHADOW_XINLINE size_t StartOffset(const size_t channel) const {
return channel * InnerSize();
}
/*! \brief The amount to skip to reach the next same-channel data
* This is the number of elements to skip from one past the end of the current spatial data
* to the start of the next block of the same channel's "spatial data"
* It is assumed that the pointer being calculated points just beyond the
* end of the last block of spatial data
* i.e. RGBRGB <-- 2
* RRGGBB <-- 4
**/
MSHADOW_XINLINE size_t SkipLengthToNextSameChannelData() const {
return (ChannelCount() - 1) * InnerSize();
}
MSHADOW_XINLINE size_t offset(const size_t outer,
const size_t channel,
const size_t i) const {
const size_t spatial_size = InnerSize();
const size_t skip_length = SkipLengthToNextSameChannelData();
size_t off = StartOffset(channel);
off += outer * shape_[CHANNEL] * shape_[INNER];
const size_t skips = i / spatial_size;
off += (1 + skip_length) * skips;
off += i % spatial_size;
return off;
}
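/* Worked example (illustrative), for a collapsed shape OUTER=2, CHANNEL=3, INNER=4
* and i < InnerSize():
*   offset(1, 2, 1) = StartOffset(2)        // 2 * 4 = 8
*                   + 1 * 3 * 4             // outer * CHANNEL * INNER = 12
*                   + 0                     // skips = 1 / 4 = 0
*                   + 1                     // i % 4
*                   = 21
* which is exactly the flat index of element (outer=1, channel=2, inner=1).
*/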
MSHADOW_XINLINE DType& get_ref(const size_t batch,
const size_t channel,
const size_t i) {
const size_t off = offset(batch, channel, i);
return dptr_[off];
}
MSHADOW_XINLINE const DType& get_ref(const size_t batch,
const size_t channel,
const size_t i) const {
const size_t off = offset(batch, channel, i);
return dptr_[off];
}
DType *dptr_;
size_t indexOfChannel_;
size_t shape_[COUNT];
};
inline int GetRealAxis(const TShape& shape, int axis) {
if (axis < 0) {
axis += shape.ndim();
}
return axis;
}
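/* Example (illustrative): for a 4-dimensional shape, GetRealAxis(shape, -1)
* returns 3 and GetRealAxis(shape, 1) returns 1, mirroring the negative-axis
* handling applied to param_.axis in BatchNormProp::InferShape() above.
*/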
extern volatile bool disable_mkl;
} // namespace batchnorm
#endif // DMLC_USE_CXX11
} // namespace op
} // namespace mxnet
#ifdef __GNUG__
#pragma GCC diagnostic pop
#endif
#endif // MXNET_OPERATOR_BATCH_NORM_INL_H_