/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file convolution-inl.h
* \brief Parameters and im2col + GEMM based implementation of the Convolution operator
* \ref: https://github.com/Yangqing/caffe/wiki/Convolution-in-Caffe:-a-memo
* \author Bing Xu, Jun Wu, Da Zheng
*/
#ifndef MXNET_OPERATOR_NN_CONVOLUTION_INL_H_
#define MXNET_OPERATOR_NN_CONVOLUTION_INL_H_
#include <mxnet/io.h>
#include <mxnet/base.h>
#include <mxnet/ndarray.h>
#include <mxnet/operator.h>
#include <mxnet/operator_util.h>
#include <mxnet/op_attr_types.h>
#include <dmlc/logging.h>
#include <dmlc/optional.h>
#include <algorithm>
#include <map>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "../operator_common.h"
#include "../linalg.h"
#include "./im2col.h"
namespace mxnet {
namespace op {
namespace conv {
enum ConvolutionOpInputs { kData, kWeight, kBias };
enum ConvolutionOpOutputs { kOut };
enum ConvolutionOpResource { kTempSpace };
enum ConvolutionOpCudnnTune { kOff, kLimited, kFastest };
} // namespace conv
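/*!
 * \brief Hyper-parameters of the Convolution operator, shared by the CPU and
 *  GPU implementations. The field descriptions below double as the
 *  user-facing operator documentation.
 */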
struct ConvolutionParam : public dmlc::Parameter<ConvolutionParam> {
mxnet::TShape kernel;
mxnet::TShape stride;
mxnet::TShape dilate;
mxnet::TShape pad;
uint32_t num_filter;
uint32_t num_group;
uint64_t workspace;
bool no_bias;
dmlc::optional<int> cudnn_tune;
bool cudnn_off;
dmlc::optional<int> layout;
DMLC_DECLARE_PARAMETER(ConvolutionParam) {
DMLC_DECLARE_FIELD(kernel).describe("Convolution kernel size: (w,), (h, w) or (d, h, w)");
DMLC_DECLARE_FIELD(stride)
.set_default(mxnet::TShape(0, 0))
.describe(
"Convolution stride: (w,), (h, w) or (d, h, w). Defaults to 1 for each dimension.");
DMLC_DECLARE_FIELD(dilate)
.set_default(mxnet::TShape(0, 0))
.describe(
"Convolution dilate: (w,), (h, w) or (d, h, w). Defaults to 1 for each dimension.");
DMLC_DECLARE_FIELD(pad)
.set_default(mxnet::TShape(0, 0))
.describe("Zero pad for convolution: (w,), (h, w) or (d, h, w). Defaults to no padding.");
DMLC_DECLARE_FIELD(num_filter)
.set_lower_bound(1)
.describe("Convolution filter(channel) number");
DMLC_DECLARE_FIELD(num_group).set_default(1).describe("Number of group partitions.");
DMLC_DECLARE_FIELD(workspace).set_default(1024).set_lower_bound(0).describe(
"Maximum temporary workspace allowed (MB) in convolution."
"This parameter has two usages. When CUDNN is not used, it determines the "
"effective batch size of the convolution kernel. When CUDNN is used, it controls "
"the maximum temporary storage used for tuning the best CUDNN kernel when "
"`limited_workspace` strategy is used.");
DMLC_DECLARE_FIELD(no_bias).set_default(false).describe("Whether to disable bias parameter.");
DMLC_DECLARE_FIELD(cudnn_tune)
.add_enum("off", conv::kOff)
.add_enum("limited_workspace", conv::kLimited)
.add_enum("fastest", conv::kFastest)
.set_default(dmlc::optional<int>())
.describe("Whether to pick convolution algo by running performance test.");
DMLC_DECLARE_FIELD(cudnn_off).set_default(false).describe("Turn off cudnn for this layer.");
DMLC_DECLARE_FIELD(layout)
.add_enum("NCW", mshadow::kNCW)
.add_enum("NCHW", mshadow::kNCHW)
.add_enum("NCDHW", mshadow::kNCDHW)
.add_enum("NWC", mshadow::kNWC)
.add_enum("NHWC", mshadow::kNHWC)
.add_enum("NDHWC", mshadow::kNDHWC)
.set_default(dmlc::optional<int>())
.describe(
"Set layout for input, output and weight. Empty for\n "
"default layout: NCW for 1d, NCHW for 2d and NCDHW for 3d."
"NHWC and NDHWC are only supported on GPU.");
}
// Returns the effective kernel extent along dimension `dim` after dilation:
// 1 + (kernel - 1) * dilate, e.g. a 3-tap kernel with dilation 2 spans 5 inputs.
index_t DilatedKernelSize(int dim) const {
return 1 + (kernel[dim] - 1) * dilate[dim];
}
bool operator==(const ConvolutionParam& other) const {
return this->kernel == other.kernel && this->stride == other.stride &&
this->dilate == other.dilate && this->pad == other.pad &&
this->num_filter == other.num_filter && this->num_group == other.num_group &&
this->workspace == other.workspace && this->no_bias == other.no_bias &&
this->cudnn_tune == other.cudnn_tune && this->cudnn_off == other.cudnn_off &&
this->layout == other.layout;
}
std::string CudnnTune2String(int cudnn_tune) {
switch (cudnn_tune) {
case conv::kOff:
return "off";
case conv::kLimited:
return "limited_workspace";
case conv::kFastest:
return "fastest";
default:
LOG(FATAL) << "Unknown cudnn_tune enum " << cudnn_tune;
}
LOG(FATAL) << "should not reach here ";
return "";
}
std::string Layout2String(int layout) {
switch (layout) {
case mshadow::kNCW:
return "NCW";
case mshadow::kNCHW:
return "NCHW";
case mshadow::kNCDHW:
return "NCDHW";
case mshadow::kNHWC:
return "NHWC";
case mshadow::kNDHWC:
return "NDHWC";
default:
LOG(FATAL) << "Unknown layout enum " << layout;
}
LOG(FATAL) << "should not reach here ";
return "";
}
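// Fills `dict` with the string form of every field; the optional enum fields
// (`cudnn_tune`, `layout`) are rendered with their symbolic names when set.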
void SetAttrDict(std::unordered_map<std::string, std::string>* dict) {
std::ostringstream kernel_s, stride_s, dilate_s, pad_s, num_filter_s, num_group_s, workspace_s,
no_bias_s, cudnn_tune_s, cudnn_off_s, layout_s;
kernel_s << kernel;
stride_s << stride;
dilate_s << dilate;
pad_s << pad;
num_filter_s << num_filter;
num_group_s << num_group;
workspace_s << workspace;
no_bias_s << no_bias;
cudnn_tune_s << cudnn_tune;
cudnn_off_s << cudnn_off;
layout_s << layout;
(*dict)["kernel"] = kernel_s.str();
(*dict)["stride"] = stride_s.str();
(*dict)["dilate"] = dilate_s.str();
(*dict)["pad"] = pad_s.str();
(*dict)["num_filter"] = num_filter_s.str();
(*dict)["num_group"] = num_group_s.str();
(*dict)["workspace"] = workspace_s.str();
(*dict)["no_bias"] = no_bias_s.str();
if (cudnn_tune.has_value()) {
(*dict)["cudnn_tune"] = CudnnTune2String(cudnn_tune.value());
} else {
(*dict)["cudnn_tune"] = cudnn_tune_s.str();
}
(*dict)["cudnn_off"] = cudnn_off_s.str();
if (layout.has_value()) {
(*dict)["layout"] = Layout2String(layout.value());
} else {
(*dict)["layout"] = layout_s.str();
}
}
};
void ConvolutionParamParser(nnvm::NodeAttrs* attrs);
typedef ParamOpSign<ConvolutionParam> ConvSignature;
} // namespace op
} // namespace mxnet
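// std::hash specialization so that ConvolutionParam (e.g. wrapped in the
// ConvSignature key above) can be used with unordered containers.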
namespace std {
template <>
struct hash<mxnet::op::ConvolutionParam> {
size_t operator()(const mxnet::op::ConvolutionParam& val) const {
size_t ret = 0;
ret = dmlc::HashCombine(ret, val.kernel);
ret = dmlc::HashCombine(ret, val.stride);
ret = dmlc::HashCombine(ret, val.dilate);
ret = dmlc::HashCombine(ret, val.pad);
ret = dmlc::HashCombine(ret, val.num_filter);
ret = dmlc::HashCombine(ret, val.num_group);
ret = dmlc::HashCombine(ret, val.workspace);
ret = dmlc::HashCombine(ret, val.no_bias);
ret = dmlc::HashCombine(ret, val.cudnn_tune);
ret = dmlc::HashCombine(ret, val.cudnn_off);
ret = dmlc::HashCombine(ret, val.layout);
return ret;
}
};
} // namespace std
namespace mxnet {
namespace op {
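/*!
 * \brief Generic convolution operator, templated over the device type (xpu)
 *  and data type. Forward and backward are implemented with im2col/col2im
 *  plus grouped GEMM, following the Caffe memo referenced in the file header.
 */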
template <typename xpu, typename DType>
class ConvolutionOp {
public:
void Init(ConvolutionParam p) {
this->param_ = p;
// convert MBytes first to Bytes and then to elements.
param_.workspace = (param_.workspace << 20) / sizeof(DType);
if (param_.layout.has_value()) {
CHECK(param_.layout.value() == mshadow::kNCW || param_.layout.value() == mshadow::kNCHW ||
param_.layout.value() == mshadow::kNCDHW)
<< "Only support NCW, NCHW and NCDHW layout";
}
}
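// Forward pass. in_data is {data, weight} or {data, weight, bias} depending
// on no_bias; out_data holds the single convolution output.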
void Forward(const OpContext& ctx,
const std::vector<TBlob>& in_data,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& out_data) {
using namespace mshadow;
using namespace mshadow::expr;
size_t expected = param_.no_bias ? 2 : 3;
CHECK_EQ(in_data.size(), expected);
CHECK_EQ(out_data.size(), 1U);
// CHECK_EQ(req[conv::kOut], kWriteTo);
_Forward(ctx,
in_data[conv::kData],
in_data[conv::kWeight],
param_.no_bias ? nullptr : &in_data[conv::kBias],
req[conv::kOut],
out_data[conv::kOut]);
}
void Backward(const OpContext& ctx,
const std::vector<TBlob>& out_grad,
const std::vector<TBlob>& in_data,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& in_grad) {
using namespace mshadow;
using namespace mshadow::expr;
CHECK_EQ(out_grad.size(), 1U);
// The gradients need the input data and the weight; the bias value itself
// is not required to compute any of them.
size_t expected = param_.no_bias ? 2 : 3;
CHECK_EQ(in_data.size(), expected);
CHECK_EQ(in_grad.size(), expected);
CHECK_EQ(req.size(), expected);
CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true);
auto workspace = _BackwardData(
ctx, out_grad[conv::kOut], in_data[conv::kWeight], req[conv::kData], in_grad[conv::kData]);
_BackwardWeightsBias(workspace,
ctx,
out_grad[conv::kOut],
in_data[conv::kData],
req[conv::kWeight],
in_grad[conv::kWeight],
param_.no_bias ? OpReqType() : req[conv::kBias],
param_.no_bias ? nullptr : &in_grad[conv::kBias]);
}
private:
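// Shared forward implementation. Returns the im2col workspace it allocated
// (an unallocated tensor on the 1x1 fast path).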
Tensor<xpu, 1, DType> _Forward(const OpContext& ctx,
const TBlob& in_data,
const TBlob& in_weights,
const TBlob* in_bias,
const OpReqType req,
const TBlob& out_data) {
using namespace mshadow;
using namespace mshadow::expr;
LayerSetUp(in_data.shape_, out_data.shape_);
Stream<xpu>* s = ctx.get_stream<xpu>();
Tensor<xpu, 1, DType> workspace;
// set up the weight and output tensor views used by the grouped GEMM
index_t M = conv_out_channels_ / group_;
index_t N = conv_out_spatial_dim_;
index_t K = kernel_dim_;
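// Per-group GEMM dimensions: M = output channels per group, N = output
// pixels per channel, K = input channels per group * kernel size, so
// output[g] (M x N) = weight[g] (M x K) * col_buffer[g] (K x N).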
Tensor<xpu, 3, DType> weight_3d =
in_weights.get_with_shape<xpu, 3, DType>(Shape3(group_, M, K), s);
Tensor<xpu, 4, DType> output_4d =
out_data.get_with_shape<xpu, 4, DType>(Shape4(num_, group_, M, N), s);
// 1x1 kernel with unit stride and no padding: im2col is the identity,
// so no extra buffer or data reordering is needed
if (is_1x1_) {
Tensor<xpu, 4, DType> input_4d =
in_data.get_with_shape<xpu, 4, DType>(Shape4(num_, group_, K, N), s);
for (index_t n = 0; n < num_; ++n) {
Tensor<xpu, 3, DType> input_3d = input_4d[n];
Tensor<xpu, 3, DType> output_3d = output_4d[n];
for (index_t g = 0; g < group_; ++g) {
linalg_gemm(weight_3d[g], input_3d[g], output_3d[g], false, false, s, req);
}
}
} else {
// allocate workspace for col_buffer
workspace = ctx.requested[conv::kTempSpace].get_space_typed<xpu, 1, DType>(
Shape1(col_buffer_size_), s);
// calculate the shape of col_buffer
mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1, 1);
col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size();
for (int i = 1; i < col_buffer_shape.ndim(); ++i) {
col_buffer_shape[i] = out_data.shape_[i + 1];
}
// create a column buffer using workspace and col_buffer_shape
TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType<DType>::kFlag);
Tensor<xpu, 3, DType> col_buffer_3d =
col_buffer.get_with_shape<xpu, 3, DType>(Shape3(group_, K, N), s);
for (index_t n = 0; n < num_; ++n) {
// transform image to col_buffer in order to use gemm
im2col(s,
in_data.dptr<DType>() + n * input_dim_,
in_data.shape_,
col_buffer.shape_,
param_.kernel,
param_.pad,
param_.stride,
param_.dilate,
col_buffer.dptr<DType>());
Tensor<xpu, 3, DType> output_3d = output_4d[n];
for (index_t g = 0; g < group_; ++g) {
// Legacy approach shown here for comparison:
// Assign(output_3d[g], req, dot(weight_3d[g], col_buffer_3d[g]));
linalg_gemm(weight_3d[g], col_buffer_3d[g], output_3d[g], false, false, s, req);
}
}
}
if (bias_term_) {
CHECK(in_bias != nullptr);
Tensor<xpu, 1, DType> bias = in_bias->get<xpu, 1, DType>(s);
Tensor<xpu, 3, DType> output_3d = out_data.get_with_shape<xpu, 3, DType>(
Shape3(num_, conv_out_channels_, conv_out_spatial_dim_), s);
// bias term present: broadcast it along the channel dimension and add to the output
output_3d += mshadow::expr::broadcast<1>(bias, output_3d.shape_);
}
return workspace;
}
// Computes dLoss/dData
Tensor<xpu, 1, DType> _BackwardData(const OpContext& ctx,
const TBlob& out_grad,
const TBlob& weights,
const OpReqType data_grad_req,
const TBlob& data_grad_dst) {
using namespace mshadow;
using namespace mshadow::expr;
CHECK_EQ(weights.CheckContiguous(), true);
LayerSetUp(data_grad_dst.shape_, out_grad.shape_);
Stream<xpu>* s = ctx.get_stream<xpu>();
Tensor<xpu, 1, DType> workspace;
// set up the weight and output-gradient tensor views used by the grouped GEMM
index_t M = kernel_dim_;
index_t N = conv_out_spatial_dim_;
index_t K = conv_out_channels_ / group_;
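// Per-group GEMM dimensions for the data gradient: M = kernel_dim_, N =
// output pixels per channel, K = output channels per group, computing
// dcol[g] (M x N) = weight[g]^T (M x K) * out_grad[g] (K x N); col2im then
// scatters dcol back into the input gradient (the 1x1 path writes it directly).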
Tensor<xpu, 3, DType> weight_3d =
weights.get_with_shape<xpu, 3, DType>(Shape3(group_, K, M), s);
Tensor<xpu, 4, DType> out_grad_4d =
out_grad.get_with_shape<xpu, 4, DType>(Shape4(num_, group_, K, N), s);
// 1x1 kernel with unit stride and no padding: col2im is the identity,
// so no extra buffer or data reordering is needed
if (is_1x1_) {
Tensor<xpu, 4, DType> in_grad_4d =
data_grad_dst.get_with_shape<xpu, 4, DType>(Shape4(num_, group_, M, N), s);
for (index_t n = 0; n < num_; ++n) {
Tensor<xpu, 3, DType> in_grad_3d = in_grad_4d[n];
Tensor<xpu, 3, DType> out_grad_3d = out_grad_4d[n];
for (index_t g = 0; g < group_; ++g) {
linalg_gemm(weight_3d[g], out_grad_3d[g], in_grad_3d[g], true, false, s);
}
}
} else {
// allocate workspace for col_buffer
workspace = ctx.requested[conv::kTempSpace].get_space_typed<xpu, 1, DType>(
Shape1(col_buffer_size_), s);
// calculate the shape of col_buffer
mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1, 1);
col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size();
for (int i = 1; i < col_buffer_shape.ndim(); ++i) {
col_buffer_shape[i] = out_grad.shape_[i + 1];
}
// create a column buffer using workspace and col_buffer_shape
TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType<DType>::kFlag);
Tensor<xpu, 3, DType> col_buffer_3d =
col_buffer.get_with_shape<xpu, 3, DType>(Shape3(group_, M, N), s);
for (index_t n = 0; n < num_; ++n) {
Tensor<xpu, 3, DType> out_grad_3d = out_grad_4d[n];
for (index_t g = 0; g < group_; ++g) {
linalg_gemm(weight_3d[g], out_grad_3d[g], col_buffer_3d[g], true, false, s);
}
col2im(s,
col_buffer.dptr<DType>(),
data_grad_dst.shape_,
col_buffer.shape_,
param_.kernel,
param_.pad,
param_.stride,
param_.dilate,
data_grad_dst.dptr<DType>() + n * input_dim_,
data_grad_req);
}
}
return workspace;
}
// Computes dLoss/dWeights and dLoss/dBias
void _BackwardWeightsBias(Tensor<xpu, 1, DType> workspace,
const OpContext& ctx,
const TBlob& out_grad,
const TBlob& data,
const OpReqType weights_grad_req,
const TBlob& weights_grad_dst,
const OpReqType bias_grad_req,
const TBlob* const bias_grad_dst) {
using namespace mshadow;
using namespace mshadow::expr;
LayerSetUp(data.shape_, out_grad.shape_);
Stream<xpu>* s = ctx.get_stream<xpu>();
// set up the output-gradient and weight-gradient tensor views used by the grouped GEMM
index_t M = kernel_dim_;
index_t N = conv_out_spatial_dim_;
index_t K = conv_out_channels_ / group_;
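// Per-group GEMM dimensions for the weight gradient:
// dweight[g] (K x M) = out_grad[g] (K x N) * col_buffer[g]^T (N x M), i.e.
// dW = dY * col^T, accumulated over the batch via the kAddTo request below.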
Tensor<xpu, 4, DType> out_grad_4d =
out_grad.get_with_shape<xpu, 4, DType>(Shape4(num_, group_, K, N), s);
Tensor<xpu, 3, DType> dweight_3d =
weights_grad_dst.get_with_shape<xpu, 3, DType>(Shape3(group_, K, M), s);
// 1x1 kernel with unit stride and no padding: im2col is the identity,
// so no extra buffer or data reordering is needed
if (is_1x1_) {
Tensor<xpu, 4, DType> input_4d =
data.get_with_shape<xpu, 4, DType>(Shape4(num_, group_, M, N), s);
for (index_t n = 0; n < num_; ++n) {
Tensor<xpu, 3, DType> input_3d = input_4d[n];
Tensor<xpu, 3, DType> out_grad_3d = out_grad_4d[n];
for (index_t g = 0; g < group_; ++g) {
auto request = (n == 0) ? weights_grad_req : kAddTo;
linalg_gemm(out_grad_3d[g], input_3d[g], dweight_3d[g], false, true, s, request);
}
}
} else {
// allocate workspace for col_buffer
if (workspace.dptr_ == nullptr) {
workspace = ctx.requested[conv::kTempSpace].get_space_typed<xpu, 1, DType>(
Shape1(col_buffer_size_), s);
}
// calculate the shape of col_buffer
mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1, 1);
col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size();
for (int i = 1; i < col_buffer_shape.ndim(); ++i) {
col_buffer_shape[i] = out_grad.shape_[i + 1];
}
// create a column buffer using workspace and col_buffer_shape
TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType<DType>::kFlag);
Tensor<xpu, 3, DType> col_buffer_3d =
col_buffer.get_with_shape<xpu, 3, DType>(Shape3(group_, M, N), s);
for (index_t n = 0; n < num_; ++n) {
Tensor<xpu, 3, DType> out_grad_3d = out_grad_4d[n];
// dWeight accumulates over the batch; each group has its own slice
// (see the kAddTo request below)
im2col(s,
data.dptr<DType>() + n * input_dim_,
data.shape_,
col_buffer.shape_,
param_.kernel,
param_.pad,
param_.stride,
param_.dilate,
col_buffer.dptr<DType>());
for (index_t g = 0; g < group_; ++g) {
auto request = (n == 0) ? weights_grad_req : kAddTo;
linalg_gemm(out_grad_3d[g], col_buffer_3d[g], dweight_3d[g], false, true, s, request);
}
}
}
// bias gradient
if (bias_term_) {
CHECK(bias_grad_dst != nullptr);
Tensor<xpu, 1, DType> dbias = bias_grad_dst->get<xpu, 1, DType>(s);
Tensor<xpu, 3, DType> dout = out_grad.get_with_shape<xpu, 3, DType>(
Shape3(num_, conv_out_channels_, conv_out_spatial_dim_), s);
ASSIGN_DISPATCH(dbias, bias_grad_req, sumall_except_dim<1>(dout));
}
}
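// Derives all bookkeeping sizes (batch size, groups, channel counts, im2col
// buffer size, per-image strides) from the input and output shapes. Called at
// the start of every forward and backward pass.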
void LayerSetUp(const mxnet::TShape& ishape, const mxnet::TShape& oshape) {
channel_axis_ = 1; // channel axis is hard-coded (only NC[DHW]-style layouts are supported)
const index_t first_spatial_axis = channel_axis_ + 1;
const int num_axes = param_.kernel.ndim() + 2;
num_spatial_axes_ = num_axes - first_spatial_axis;
is_1x1_ = true;
for (int i = 0; i < param_.kernel.ndim(); ++i) {
is_1x1_ &= param_.kernel[i] == 1 && param_.stride[i] == 1 && param_.pad[i] == 0;
if (!is_1x1_)
break;
}
// batch size
num_ = ishape[0];
// number of input channels
channels_ = ishape[1];
group_ = param_.num_group;
conv_out_channels_ = param_.num_filter;
conv_in_channels_ = channels_;
bias_term_ = !param_.no_bias;
kernel_dim_ = conv_in_channels_ / group_ * param_.kernel.Size();
weight_offset_ = conv_out_channels_ * kernel_dim_ / group_;
conv_out_spatial_dim_ = oshape.ProdShape(2, oshape.ndim());
col_offset_ = kernel_dim_ * conv_out_spatial_dim_;
output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_;
// size of the column buffer used for storing im2col-ed pixels
col_buffer_size_ = kernel_dim_ * group_ * conv_out_spatial_dim_;
// input/output image size (#channels * height * width)
input_dim_ = ishape.ProdShape(1, ishape.ndim());
output_dim_ = oshape.ProdShape(1, oshape.ndim());
num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_;
num_kernels_col2im_ = input_dim_;
}
private:
ConvolutionParam param_;
index_t channel_axis_; // channel axis of the input
index_t channels_; // number of channels of input image
index_t num_spatial_axes_; // number of spatial axes
index_t num_; // batch size
index_t group_; // number of groups
index_t conv_out_channels_; // number of output channels (num_filter)
index_t conv_out_spatial_dim_; // number of pixels of output images per channel
index_t conv_in_channels_; // number of input channels
index_t kernel_dim_; // number of input channels per group * kernel size
index_t weight_offset_; // number of output channels per group * kernel_dim_
index_t col_offset_; // im2col buffer elements per group (kernel_dim_ * out spatial dim)
index_t output_offset_; // output elements per group per image
index_t col_buffer_size_; // total elements in the im2col buffer for one image
index_t input_dim_; // elements in one input image (channels * spatial size)
index_t output_dim_; // elements in one output image (channels * spatial size)
index_t num_kernels_im2col_; // im2col work items (in channels * out spatial dim)
index_t num_kernels_col2im_; // col2im work items (one per input element)
bool bias_term_; // has bias term?
bool is_1x1_;
template <typename xpu_, typename DType_>
friend class DeconvolutionOp;
}; // class ConvolutionOp
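// Stateless forward entry point: instantiate a ConvolutionOp for the input's
// data type and run a single forward pass.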
template <typename xpu>
void ConvolutionCompute(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
MSHADOW_REAL_TYPE_SWITCH(inputs[conv::kData].type_flag_, DType, {
ConvolutionOp<xpu, DType> op;
op.Init(param);
op.Forward(ctx, inputs, req, outputs);
});
}
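// Stateless backward entry point. inputs are {out_grad, data, weight[, bias]};
// outputs receive the gradients for {data, weight[, bias]} in the same order.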
template <typename xpu>
void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
const TBlob& out_grad = inputs[0];
const std::vector<TBlob>& in_grad = outputs;
MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, {
ConvolutionOp<xpu, DType> op;
op.Init(param);
op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
});
}
} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_NN_CONVOLUTION_INL_H_