/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file convolution-inl.h
* \brief Parameters and im2col + GEMM based implementation of the Convolution operator
* \ref: https://github.com/Yangqing/caffe/wiki/Convolution-in-Caffe:-a-memo
* \author Bing Xu, Jun Wu, Da Zheng
*/
#ifndef MXNET_OPERATOR_NN_CONVOLUTION_INL_H_
#define MXNET_OPERATOR_NN_CONVOLUTION_INL_H_
#include <mxnet/io.h>
#include <mxnet/base.h>
#include <mxnet/ndarray.h>
#include <mxnet/operator.h>
#include <mxnet/operator_util.h>
#include <mxnet/op_attr_types.h>
#include <dmlc/logging.h>
#include <dmlc/optional.h>
#include <algorithm>
#include <map>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "../operator_common.h"
#include "../linalg.h"
#include "./im2col.h"
namespace mxnet {
namespace op {
namespace conv {
enum ConvolutionOpInputs { kData, kWeight, kBias };
enum ConvolutionOpOutputs { kOut };
enum ConvolutionOpResource { kTempSpace };
enum ConvolutionOpCudnnTune { kOff, kLimited, kFastest };
} // namespace conv
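/*!
 * \brief Hyper-parameters of the Convolution operator, shared by the CPU and
 *  GPU implementations. The field descriptions below double as the
 *  user-facing operator documentation.
 */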
struct ConvolutionParam : public dmlc::Parameter<ConvolutionParam> {
mxnet::TShape kernel;
mxnet::TShape stride;
mxnet::TShape dilate;
mxnet::TShape pad;
uint32_t num_filter;
uint32_t num_group;
uint64_t workspace;
bool no_bias;
dmlc::optional<int> cudnn_tune;
bool cudnn_off;
dmlc::optional<int> layout;
DMLC_DECLARE_PARAMETER(ConvolutionParam) {
DMLC_DECLARE_FIELD(kernel).describe("Convolution kernel size: (w,), (h, w) or (d, h, w)");
DMLC_DECLARE_FIELD(stride)
.set_default(mxnet::TShape(0, 0))
.describe(
"Convolution stride: (w,), (h, w) or (d, h, w). Defaults to 1 for each dimension.");
DMLC_DECLARE_FIELD(dilate)
.set_default(mxnet::TShape(0, 0))
.describe(
"Convolution dilate: (w,), (h, w) or (d, h, w). Defaults to 1 for each dimension.");
DMLC_DECLARE_FIELD(pad)
.set_default(mxnet::TShape(0, 0))
.describe("Zero pad for convolution: (w,), (h, w) or (d, h, w). Defaults to no padding.");
DMLC_DECLARE_FIELD(num_filter)
.set_lower_bound(1)
.describe("Convolution filter(channel) number");
DMLC_DECLARE_FIELD(num_group).set_default(1).describe("Number of group partitions.");
DMLC_DECLARE_FIELD(workspace).set_default(1024).set_lower_bound(0).describe(
"Maximum temporary workspace allowed (MB) in convolution."
"This parameter has two usages. When CUDNN is not used, it determines the "
"effective batch size of the convolution kernel. When CUDNN is used, it controls "
"the maximum temporary storage used for tuning the best CUDNN kernel when "
"`limited_workspace` strategy is used.");
DMLC_DECLARE_FIELD(no_bias).set_default(false).describe("Whether to disable bias parameter.");
DMLC_DECLARE_FIELD(cudnn_tune)
.add_enum("off", conv::kOff)
.add_enum("limited_workspace", conv::kLimited)
.add_enum("fastest", conv::kFastest)
.set_default(dmlc::optional<int>())
.describe("Whether to pick convolution algo by running performance test.");
DMLC_DECLARE_FIELD(cudnn_off).set_default(false).describe("Turn off cudnn for this layer.");
DMLC_DECLARE_FIELD(layout)
.add_enum("NCW", mshadow::kNCW)
.add_enum("NCHW", mshadow::kNCHW)
.add_enum("NCDHW", mshadow::kNCDHW)
.add_enum("NWC", mshadow::kNWC)
.add_enum("NHWC", mshadow::kNHWC)
.add_enum("NDHWC", mshadow::kNDHWC)
.set_default(dmlc::optional<int>())
.describe(
"Set layout for input, output and weight. Empty for\n "
"default layout: NCW for 1d, NCHW for 2d and NCDHW for 3d."
"NHWC and NDHWC are only supported on GPU.");
}
// Returns the effective kernel extent along dimension `dim` after dilation:
// 1 + (kernel - 1) * dilate, e.g. a 3-tap kernel with dilation 2 spans 5 inputs.
index_t DilatedKernelSize(int dim) const {
return 1 + (kernel[dim] - 1) * dilate[dim];
}
bool operator==(const ConvolutionParam& other) const {
return this->kernel == other.kernel && this->stride == other.stride &&
this->dilate == other.dilate && this->pad == other.pad &&
this->num_filter == other.num_filter && this->num_group == other.num_group &&
this->workspace == other.workspace && this->no_bias == other.no_bias &&
this->cudnn_tune == other.cudnn_tune && this->cudnn_off == other.cudnn_off &&
this->layout == other.layout;
}
std::string CudnnTune2String(int cudnn_tune) {
switch (cudnn_tune) {
case conv::kOff:
return "off";
case conv::kLimited:
return "limited_workspace";
case conv::kFastest:
return "fastest";
default:
LOG(FATAL) << "Unknown cudnn_tune enum " << cudnn_tune;
}
LOG(FATAL) << "should not reach here ";
return "";
}
std::string Layout2String(int layout) {
switch (layout) {
case mshadow::kNCW:
return "NCW";
case mshadow::kNCHW:
return "NCHW";
case mshadow::kNCDHW:
return "NCDHW";
case mshadow::kNHWC:
return "NHWC";
case mshadow::kNDHWC:
return "NDHWC";
default:
LOG(FATAL) << "Unknown layout enum " << layout;
}
LOG(FATAL) << "should not reach here ";
return "";
}
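// Fills `dict` with the string form of every field; the optional enum fields
// (`cudnn_tune`, `layout`) are rendered with their symbolic names when set.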
void SetAttrDict(std::unordered_map<std::string, std::string>* dict) {
std::ostringstream kernel_s, stride_s, dilate_s, pad_s, num_filter_s, num_group_s, workspace_s,
no_bias_s, cudnn_tune_s, cudnn_off_s, layout_s;
kernel_s << kernel;
stride_s << stride;
dilate_s << dilate;
pad_s << pad;
num_filter_s << num_filter;
num_group_s << num_group;
workspace_s << workspace;
no_bias_s << no_bias;
cudnn_tune_s << cudnn_tune;
cudnn_off_s << cudnn_off;
layout_s << layout;
(*dict)["kernel"] = kernel_s.str();
(*dict)["stride"] = stride_s.str();
(*dict)["dilate"] = dilate_s.str();
(*dict)["pad"] = pad_s.str();
(*dict)["num_filter"] = num_filter_s.str();
(*dict)["num_group"] = num_group_s.str();
(*dict)["workspace"] = workspace_s.str();
(*dict)["no_bias"] = no_bias_s.str();
if (cudnn_tune.has_value()) {
(*dict)["cudnn_tune"] = CudnnTune2String(cudnn_tune.value());
} else {
(*dict)["cudnn_tune"] = cudnn_tune_s.str();
}
(*dict)["cudnn_off"] = cudnn_off_s.str();
if (layout.has_value()) {
(*dict)["layout"] = Layout2String(layout.value());
} else {
(*dict)["layout"] = layout_s.str();
}
}
};
void ConvolutionParamParser(nnvm::NodeAttrs* attrs);
typedef ParamOpSign<ConvolutionParam> ConvSignature;
} // namespace op
} // namespace mxnet
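// std::hash specialization so that ConvolutionParam (e.g. wrapped in the
// ConvSignature key above) can be used with unordered containers.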
namespace std {
template <>
struct hash<mxnet::op::ConvolutionParam> {
size_t operator()(const mxnet::op::ConvolutionParam& val) const {
size_t ret = 0;
ret = dmlc::HashCombine(ret, val.kernel);
ret = dmlc::HashCombine(ret, val.stride);
ret = dmlc::HashCombine(ret, val.dilate);
ret = dmlc::HashCombine(ret, val.pad);
ret = dmlc::HashCombine(ret, val.num_filter);
ret = dmlc::HashCombine(ret, val.num_group);
ret = dmlc::HashCombine(ret, val.workspace);
ret = dmlc::HashCombine(ret, val.no_bias);
ret = dmlc::HashCombine(ret, val.cudnn_tune);
ret = dmlc::HashCombine(ret, val.cudnn_off);
ret = dmlc::HashCombine(ret, val.layout);
return ret;
}
};
} // namespace std
namespace mxnet {
namespace op {
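/*!
 * \brief Generic convolution operator, templated over the device type (xpu)
 *  and data type. Forward and backward are implemented with im2col/col2im
 *  plus grouped GEMM, following the Caffe memo referenced in the file header.
 */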
template <typename xpu, typename DType>
class ConvolutionOp {
public:
void Init(ConvolutionParam p) {
this->param_ = p;
// convert MBytes first to Bytes and then to elements.
param_.workspace = (param_.workspace << 20) / sizeof(DType);
if (param_.layout.has_value()) {
CHECK(param_.layout.value() == mshadow::kNCW || param_.layout.value() == mshadow::kNCHW ||
param_.layout.value() == mshadow::kNCDHW)
<< "Only support NCW, NCHW and NCDHW layout";
}
}
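// Forward pass. in_data is {data, weight} or {data, weight, bias} depending
// on no_bias; out_data holds the single convolution output.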
void Forward(const OpContext& ctx,
const std::vector<TBlob>& in_data,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& out_data) {
using namespace mshadow;
using namespace mshadow::expr;
size_t expected = param_.no_bias ? 2 : 3;
CHECK_EQ(in_data.size(), expected);
CHECK_EQ(out_data.size(), 1U);
// CHECK_EQ(req[conv::kOut], kWriteTo);
_Forward(ctx,
in_data[conv::kData],
in_data[conv::kWeight],
param_.no_bias ? nullptr : &in_data[conv::kBias],
req[conv::kOut],
out_data[conv::kOut]);
}
void Backward(const OpContext& ctx,
const std::vector<TBlob>& out_grad,
const std::vector<TBlob>& in_data,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& in_grad) {
using namespace mshadow;
using namespace mshadow::expr;
CHECK_EQ(out_grad.size(), 1U);
// The gradients need the input data and the weight; the bias value itself
// is not required to compute any of them.
size_t expected = param_.no_bias ? 2 : 3;
CHECK_EQ(in_data.size(), expected);
CHECK_EQ(in_grad.size(), expected);
CHECK_EQ(req.size(), expected);
CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true);
auto workspace = _BackwardData(
ctx, out_grad[conv::kOut], in_data[conv::kWeight], req[conv::kData], in_grad[conv::kData]);
_BackwardWeightsBias(workspace,
ctx,
out_grad[conv::kOut],
in_data[conv::kData],
req[conv::kWeight],
in_grad[conv::kWeight],
param_.no_bias ? OpReqType() : req[conv::kBias],
param_.no_bias ? nullptr : &in_grad[conv::kBias]);
}
private:
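// Shared forward implementation. Returns the im2col workspace it allocated
// (an unallocated tensor on the 1x1 fast path).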
Tensor<xpu, 1, DType> _Forward(const OpContext& ctx,
const TBlob& in_data,
const TBlob& in_weights,
const TBlob* in_bias,
const OpReqType req,
const TBlob& out_data) {
using namespace mshadow;
using namespace mshadow::expr;
LayerSetUp(in_data.shape_, out_data.shape_);
Stream<xpu>* s = ctx.get_stream<xpu>();
Tensor<xpu, 1, DType> workspace;
// set up the weight and output tensor views used by the grouped GEMM
index_t M = conv_out_channels_ / group_;
index_t N = conv_out_spatial_dim_;
index_t K = kernel_dim_;
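// Per-group GEMM dimensions: M = output channels per group, N = output
// pixels per channel, K = input channels per group * kernel size, so
// output[g] (M x N) = weight[g] (M x K) * col_buffer[g] (K x N).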
Tensor<xpu, 3, DType> weight_3d =
in_weights.get_with_shape<xpu, 3, DType>(Shape3(group_, M, K), s);
Tensor<xpu, 4, DType> output_4d =
out_data.get_with_shape<xpu, 4, DType>(Shape4(num_, group_, M, N), s);
// 1x1 kernel with unit stride and no padding: im2col is the identity,
// so no extra buffer or data reordering is needed
if (is_1x1_) {
Tensor<xpu, 4, DType> input_4d =
in_data.get_with_shape<xpu, 4, DType>(Shape4(num_, group_, K, N), s);
for (index_t n = 0; n < num_; ++n) {
Tensor<xpu, 3, DType> input_3d = input_4d[n];
Tensor<xpu, 3, DType> output_3d = output_4d[n];
for (index_t g = 0; g < group_; ++g) {
linalg_gemm(weight_3d[g], input_3d[g], output_3d[g], false, false, s, req);
}
}
} else {
// allocate workspace for col_buffer
workspace = ctx.requested[conv::kTempSpace].get_space_typed<xpu, 1, DType>(
Shape1(col_buffer_size_), s);
// calculate the shape of col_buffer
mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1, 1);
col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size();
for (int i = 1; i < col_buffer_shape.ndim(); ++i) {
col_buffer_shape[i] = out_data.shape_[i + 1];
}
// create a column buffer using workspace and col_buffer_shape
TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType<DType>::kFlag);
Tensor<xpu, 3, DType> col_buffer_3d =
col_buffer.get_with_shape<xpu, 3, DType>(Shape3(group_, K, N), s);
for (index_t n = 0; n < num_; ++n) {
// transform image to col_buffer in order to use gemm
im2col(s,
in_data.dptr<DType>() + n * input_dim_,
in_data.shape_,
col_buffer.shape_,
param_.kernel,
param_.pad,
param_.stride,
param_.dilate,
col_buffer.dptr<DType>());
Tensor<xpu, 3, DType> output_3d = output_4d[n];
for (index_t g = 0; g < group_; ++g) {
// Legacy approach shown here for comparison:
// Assign(output_3d[g], req, dot(weight_3d[g], col_buffer_3d[g]));
linalg_gemm(weight_3d[g], col_buffer_3d[g], output_3d[g], false, false, s, req);
}
}
}
if (bias_term_) {
CHECK(in_bias != nullptr);
Tensor<xpu, 1, DType> bias = in_bias->get<xpu, 1, DType>(s);
Tensor<xpu, 3, DType> output_3d = out_data.get_with_shape<xpu, 3, DType>(
Shape3(num_, conv_out_channels_, conv_out_spatial_dim_), s);
// bias term present: broadcast it along the channel dimension and add to the output
output_3d += mshadow::expr::broadcast<1>(bias, output_3d.shape_);
}
return workspace;
}
// Computes dLoss/dData
Tensor<xpu, 1, DType> _BackwardData(const OpContext& ctx,
const TBlob& out_grad,
const TBlob& weights,
const OpReqType data_grad_req,
const TBlob& data_grad_dst) {
using namespace mshadow;
using namespace mshadow::expr;
CHECK_EQ(weights.CheckContiguous(), true);
LayerSetUp(data_grad_dst.shape_, out_grad.shape_);
Stream<xpu>* s = ctx.get_stream<xpu>();
Tensor<xpu, 1, DType> workspace;
// set up the weight and output-gradient tensor views used by the grouped GEMM
index_t M = kernel_dim_;
index_t N = conv_out_spatial_dim_;
index_t K = conv_out_channels_ / group_;
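// Per-group GEMM dimensions for the data gradient: M = kernel_dim_, N =
// output pixels per channel, K = output channels per group, computing
// dcol[g] (M x N) = weight[g]^T (M x K) * out_grad[g] (K x N); col2im then
// scatters dcol back into the input gradient (the 1x1 path writes it directly).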
Tensor<xpu, 3, DType> weight_3d =
weights.get_with_shape<xpu, 3, DType>(Shape3(group_, K, M), s);
Tensor<xpu, 4, DType> out_grad_4d =
out_grad.get_with_shape<xpu, 4, DType>(Shape4(num_, group_, K, N), s);
// 1x1 kernel with unit stride and no padding: col2im is the identity,
// so no extra buffer or data reordering is needed
if (is_1x1_) {
Tensor<xpu, 4, DType> in_grad_4d =
data_grad_dst.get_with_shape<xpu, 4, DType>(Shape4(num_, group_, M, N), s);
for (index_t n = 0; n < num_; ++n) {
Tensor<xpu, 3, DType> in_grad_3d = in_grad_4d[n];
Tensor<xpu, 3, DType> out_grad_3d = out_grad_4d[n];
for (index_t g = 0; g < group_; ++g) {
linalg_gemm(weight_3d[g], out_grad_3d[g], in_grad_3d[g], true, false, s);
}
}
} else {
// allocate workspace for col_buffer
workspace = ctx.requested[conv::kTempSpace].get_space_typed<xpu, 1, DType>(
Shape1(col_buffer_size_), s);
// calculate the shape of col_buffer
mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1, 1);
col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size();
for (int i = 1; i < col_buffer_shape.ndim(); ++i) {
col_buffer_shape[i] = out_grad.shape_[i + 1];
}
// create a column buffer using workspace and col_buffer_shape
TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType<DType>::kFlag);
Tensor<xpu, 3, DType> col_buffer_3d =
col_buffer.get_with_shape<xpu, 3, DType>(Shape3(group_, M, N), s);
for (index_t n = 0; n < num_; ++n) {
Tensor<xpu, 3, DType> out_grad_3d = out_grad_4d[n];
for (index_t g = 0; g < group_; ++g) {
linalg_gemm(weight_3d[g], out_grad_3d[g], col_buffer_3d[g], true, false, s);
}
col2im(s,
col_buffer.dptr<DType>(),
data_grad_dst.shape_,
col_buffer.shape_,
param_.kernel,
param_.pad,
param_.stride,
param_.dilate,
data_grad_dst.dptr<DType>() + n * input_dim_,
data_grad_req);
}
}
return workspace;
}
// Computes dLoss/dWeights and dLoss/dBias
void _BackwardWeightsBias(Tensor<xpu, 1, DType> workspace,
const OpContext& ctx,
const TBlob& out_grad,
const TBlob& data,
const OpReqType weights_grad_req,
const TBlob& weights_grad_dst,
const OpReqType bias_grad_req,
const TBlob* const bias_grad_dst) {
using namespace mshadow;
using namespace mshadow::expr;
LayerSetUp(data.shape_, out_grad.shape_);
Stream<xpu>* s = ctx.get_stream<xpu>();
// set up the output-gradient and weight-gradient tensor views used by the grouped GEMM
index_t M = kernel_dim_;
index_t N = conv_out_spatial_dim_;
index_t K = conv_out_channels_ / group_;
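// Per-group GEMM dimensions for the weight gradient:
// dweight[g] (K x M) = out_grad[g] (K x N) * col_buffer[g]^T (N x M), i.e.
// dW = dY * col^T, accumulated over the batch via the kAddTo request below.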
Tensor<xpu, 4, DType> out_grad_4d =
out_grad.get_with_shape<xpu, 4, DType>(Shape4(num_, group_, K, N), s);
Tensor<xpu, 3, DType> dweight_3d =
weights_grad_dst.get_with_shape<xpu, 3, DType>(Shape3(group_, K, M), s);
// 1x1 kernel with unit stride and no padding: im2col is the identity,
// so no extra buffer or data reordering is needed
if (is_1x1_) {
Tensor<xpu, 4, DType> input_4d =
data.get_with_shape<xpu, 4, DType>(Shape4(num_, group_, M, N), s);
for (index_t n = 0; n < num_; ++n) {
Tensor<xpu, 3, DType> input_3d = input_4d[n];
Tensor<xpu, 3, DType> out_grad_3d = out_grad_4d[n];
for (index_t g = 0; g < group_; ++g) {
auto request = (n == 0) ? weights_grad_req : kAddTo;
linalg_gemm(out_grad_3d[g], input_3d[g], dweight_3d[g], false, true, s, request);
}
}
} else {
// allocate workspace for col_buffer
if (workspace.dptr_ == nullptr) {
workspace = ctx.requested[conv::kTempSpace].get_space_typed<xpu, 1, DType>(
Shape1(col_buffer_size_), s);
}
// calculate the shape of col_buffer
mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1, 1);
col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size();
for (int i = 1; i < col_buffer_shape.ndim(); ++i) {
col_buffer_shape[i] = out_grad.shape_[i + 1];
}
// create a column buffer using workspace and col_buffer_shape
TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType<DType>::kFlag);
Tensor<xpu, 3, DType> col_buffer_3d =
col_buffer.get_with_shape<xpu, 3, DType>(Shape3(group_, M, N), s);
for (index_t n = 0; n < num_; ++n) {
Tensor<xpu, 3, DType> out_grad_3d = out_grad_4d[n];
// dWeight accumulates over the batch; each group has its own slice
// (see the kAddTo request below)
im2col(s,
data.dptr<DType>() + n * input_dim_,
data.shape_,
col_buffer.shape_,
param_.kernel,
param_.pad,
param_.stride,
param_.dilate,
col_buffer.dptr<DType>());
for (index_t g = 0; g < group_; ++g) {
auto request = (n == 0) ? weights_grad_req : kAddTo;
linalg_gemm(out_grad_3d[g], col_buffer_3d[g], dweight_3d[g], false, true, s, request);
}
}
}
// bias gradient
if (bias_term_) {
CHECK(bias_grad_dst != nullptr);
Tensor<xpu, 1, DType> dbias = bias_grad_dst->get<xpu, 1, DType>(s);
Tensor<xpu, 3, DType> dout = out_grad.get_with_shape<xpu, 3, DType>(
Shape3(num_, conv_out_channels_, conv_out_spatial_dim_), s);
ASSIGN_DISPATCH(dbias, bias_grad_req, sumall_except_dim<1>(dout));
}
}
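// Derives all bookkeeping sizes (batch size, groups, channel counts, im2col
// buffer size, per-image strides) from the input and output shapes. Called at
// the start of every forward and backward pass.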
void LayerSetUp(const mxnet::TShape& ishape, const mxnet::TShape& oshape) {
channel_axis_ = 1; // channel axis is hard-coded (only NC[DHW]-style layouts are supported)
const index_t first_spatial_axis = channel_axis_ + 1;
const int num_axes = param_.kernel.ndim() + 2;
num_spatial_axes_ = num_axes - first_spatial_axis;
is_1x1_ = true;
for (int i = 0; i < param_.kernel.ndim(); ++i) {
is_1x1_ &= param_.kernel[i] == 1 && param_.stride[i] == 1 && param_.pad[i] == 0;
if (!is_1x1_)
break;
}
// batch size
num_ = ishape[0];
// number of input channels
channels_ = ishape[1];
group_ = param_.num_group;
conv_out_channels_ = param_.num_filter;
conv_in_channels_ = channels_;
bias_term_ = !param_.no_bias;
kernel_dim_ = conv_in_channels_ / group_ * param_.kernel.Size();
weight_offset_ = conv_out_channels_ * kernel_dim_ / group_;
conv_out_spatial_dim_ = oshape.ProdShape(2, oshape.ndim());
col_offset_ = kernel_dim_ * conv_out_spatial_dim_;
output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_;
// size of the column buffer used for storing im2col-ed pixels
col_buffer_size_ = kernel_dim_ * group_ * conv_out_spatial_dim_;
// input/output image size (#channels * height * width)
input_dim_ = ishape.ProdShape(1, ishape.ndim());
output_dim_ = oshape.ProdShape(1, oshape.ndim());
num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_;
num_kernels_col2im_ = input_dim_;
}
private:
ConvolutionParam param_;
index_t channel_axis_; // channel axis of the input
index_t channels_; // number of channels of input image
index_t num_spatial_axes_; // number of spatial axes
index_t num_; // batch size
index_t group_; // number of groups
index_t conv_out_channels_; // number of output channels (num_filter)
index_t conv_out_spatial_dim_; // number of pixels of output images per channel
index_t conv_in_channels_; // number of input channels
index_t kernel_dim_; // number of input channels per group * kernel size
index_t weight_offset_; // number of output channels per group * kernel_dim_
index_t col_offset_; // im2col buffer elements per group (kernel_dim_ * out spatial dim)
index_t output_offset_; // output elements per group per image
index_t col_buffer_size_; // total elements in the im2col buffer for one image
index_t input_dim_; // elements in one input image (channels * spatial size)
index_t output_dim_; // elements in one output image (channels * spatial size)
index_t num_kernels_im2col_; // im2col work items (in channels * out spatial dim)
index_t num_kernels_col2im_; // col2im work items (one per input element)
bool bias_term_; // has bias term?
bool is_1x1_;
template <typename xpu_, typename DType_>
friend class DeconvolutionOp;
}; // class ConvolutionOp
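// Stateless forward entry point: instantiate a ConvolutionOp for the input's
// data type and run a single forward pass.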
template <typename xpu>
void ConvolutionCompute(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
MSHADOW_REAL_TYPE_SWITCH(inputs[conv::kData].type_flag_, DType, {
ConvolutionOp<xpu, DType> op;
op.Init(param);
op.Forward(ctx, inputs, req, outputs);
});
}
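// Stateless backward entry point. inputs are {out_grad, data, weight[, bias]};
// outputs receive the gradients for {data, weight[, bias]} in the same order.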
template <typename xpu>
void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
const TBlob& out_grad = inputs[0];
const std::vector<TBlob>& in_grad = outputs;
MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, {
ConvolutionOp<xpu, DType> op;
op.Init(param);
op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
});
}
} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_NN_CONVOLUTION_INL_H_