/*!
* Copyright (c) 2015 by Contributors
* \file cudnn_deconvolution-inl.h
* \brief cuDNN implementation of the deconvolution operator
* \author Wei Wu
*/
#ifndef MXNET_OPERATOR_CUDNN_DECONVOLUTION_INL_H_
#define MXNET_OPERATOR_CUDNN_DECONVOLUTION_INL_H_
#include <algorithm>
#include <vector>
#include "./deconvolution-inl.h"
namespace mxnet {
namespace op {
#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1
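// cuDNN-backed deconvolution. Deconvolution is the transpose of convolution,
// so each pass maps onto the opposite cuDNN convolution kernel: Forward()
// calls cudnnConvolutionBackwardData, and the data gradient in Backward()
// calls cudnnConvolutionForward.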
template<typename DType>
class CuDNNDeconvolutionOp : public Operator {
public:
explicit CuDNNDeconvolutionOp(DeconvolutionParam param) {
this->param_ = param;
// convert MB to words
param_.workspace = (param_.workspace << 20) / sizeof(DType);
init_cudnn_ = false;
// TODO(xxx): fp16
dtype_ = mshadow::DataType<DType>::kCudnnFlag;
}
~CuDNNDeconvolutionOp() {
if (init_cudnn_) {
CHECK_EQ(cudnnDestroyTensorDescriptor(in_desc_), CUDNN_STATUS_SUCCESS);
CHECK_EQ(cudnnDestroyTensorDescriptor(out_desc_), CUDNN_STATUS_SUCCESS);
CHECK_EQ(cudnnDestroyTensorDescriptor(bias_desc_), CUDNN_STATUS_SUCCESS);
CHECK_EQ(cudnnDestroyFilterDescriptor(filter_desc_), CUDNN_STATUS_SUCCESS);
CHECK_EQ(cudnnDestroyConvolutionDescriptor(conv_desc_), CUDNN_STATUS_SUCCESS);
}
}
virtual void Forward(const OpContext &ctx,
const std::vector<TBlob> &in_data,
const std::vector<OpReqType> &req,
const std::vector<TBlob> &out_data,
const std::vector<TBlob> &aux_args) {
using namespace mshadow;
size_t expected = param_.no_bias ? 2 : 3;
CHECK_EQ(in_data.size(), expected);
CHECK_EQ(out_data.size(), 1U);
Stream<gpu> *s = ctx.get_stream<gpu>();
Tensor<gpu, 4, DType> data = in_data[deconv::kData].get<gpu, 4, DType>(s);
Tensor<gpu, 4, DType> wmat = in_data[deconv::kWeight].get<gpu, 4, DType>(s);
Tensor<gpu, 4, DType> out = out_data[deconv::kOut].get<gpu, 4, DType>(s);
CHECK_EQ(data.CheckContiguous(), true);
CHECK_EQ(wmat.CheckContiguous(), true);
CHECK_EQ(out.CheckContiguous(), true);
if (!init_cudnn_) {
Init(s, in_data, out_data);
}
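// Forward deconvolution runs cuDNN's backward-data path; Init() sizes the
// workspace so it covers the byte count claimed in the calls below.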
Tensor<gpu, 1, DType> workspace =
ctx.requested[deconv::kTempSpace].get_space_typed<gpu, 1, DType>(
mshadow::Shape1(forward_workspace_), s);
for (uint32_t g = 0; g < param_.num_group; ++g) {
typename DataType<DType>::ScaleType alpha = 1.0f;
typename DataType<DType>::ScaleType beta = 0.0f;
#if CUDNN_MAJOR <= 4
CHECK_EQ(cudnnConvolutionBackwardData_v3(s->dnn_handle_,
&alpha,
filter_desc_,
wmat.dptr_ + weight_offset_ * g,
in_desc_,
data.dptr_ + data_offset_ * g,
conv_desc_,
back_algo_,
workspace.dptr_,
backward_workspace_byte_,
&beta,
out_desc_,
out.dptr_ + out_offset_ * g), CUDNN_STATUS_SUCCESS);
#elif CUDNN_MAJOR == 5
CHECK_EQ(cudnnConvolutionBackwardData(s->dnn_handle_,
&alpha,
filter_desc_,
wmat.dptr_ + weight_offset_ * g,
in_desc_,
data.dptr_ + data_offset_ * g,
conv_desc_,
back_algo_,
workspace.dptr_,
backward_workspace_byte_,
&beta,
out_desc_,
out.dptr_ + out_offset_ * g), CUDNN_STATUS_SUCCESS);
#endif
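// Broadcast-add the bias over the spatial dimensions; beta = 1 below makes
// cudnnAddTensor accumulate into the output just computed.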
if (!param_.no_bias) {
beta = 1.0f;
Tensor<gpu, 1, DType> bias = in_data[deconv::kBias].get<gpu, 1, DType>(s);
#if CUDNN_MAJOR >= 4
CHECK_EQ(cudnnAddTensor(s->dnn_handle_,
&alpha,
bias_desc_,
bias.dptr_ + bias_offset_ * g,
&beta,
out_desc_,
out.dptr_ + out_offset_ * g), CUDNN_STATUS_SUCCESS);
#endif
#if CUDNN_MAJOR == 3
CHECK_EQ(cudnnAddTensor(s->dnn_handle_,
CUDNN_ADD_SAME_C,
&alpha,
bias_desc_,
bias.dptr_ + bias_offset_ * g,
&beta,
out_desc_,
out.dptr_ + out_offset_ * g), CUDNN_STATUS_SUCCESS);
#endif
}
}
}
virtual void Backward(const OpContext &ctx,
const std::vector<TBlob> &out_grad,
const std::vector<TBlob> &in_data,
const std::vector<TBlob> &out_data,
const std::vector<OpReqType> &req,
const std::vector<TBlob> &in_grad,
const std::vector<TBlob> &aux_args) {
using namespace mshadow;
using namespace mshadow::expr;
size_t expected = param_.no_bias ? 2 : 3;
CHECK_EQ(out_grad.size(), 1U);
CHECK(in_data.size() == expected && in_grad.size() == expected);
CHECK_NE(req[deconv::kWeight], kWriteInplace);
if (!param_.no_bias) {
  // req has no kBias slot when no_bias is set; guard to avoid indexing past the end.
  CHECK_NE(req[deconv::kBias], kWriteInplace);
}
CHECK_NE(req[deconv::kData], kWriteInplace);
Stream<gpu> *s = ctx.get_stream<gpu>();
Tensor<gpu, 4, DType> grad = out_grad[deconv::kOut].get<gpu, 4, DType>(s);
Tensor<gpu, 4, DType> wmat = in_data[deconv::kWeight].get<gpu, 4, DType>(s);
Tensor<gpu, 4, DType> gwmat = in_grad[deconv::kWeight].get<gpu, 4, DType>(s);
Tensor<gpu, 4, DType> data = in_data[deconv::kData].get<gpu, 4, DType>(s);
Tensor<gpu, 4, DType> gdata = in_grad[deconv::kData].get<gpu, 4, DType>(s);
Tensor<gpu, 1, DType> workspace =
ctx.requested[deconv::kTempSpace].get_space_typed<gpu, 1, DType>(
mshadow::Shape1(backward_workspace_), s);
for (uint32_t g = 0; g < param_.num_group; ++g) {
typename DataType<DType>::ScaleType alpha = 1.0f;
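// beta = 1 makes cuDNN accumulate into the destination (kAddTo); beta = 0 overwrites it.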
typename DataType<DType>::ScaleType bias_beta =
(!param_.no_bias && req[deconv::kBias] == kAddTo) ? 1.0f : 0.0f;
typename DataType<DType>::ScaleType data_beta =
req[deconv::kData] == kAddTo ? 1.0f : 0.0f;
typename DataType<DType>::ScaleType weight_beta =
req[deconv::kWeight] == kAddTo ? 1.0f : 0.0f;
if (!param_.no_bias) {
CHECK_NE(req[deconv::kBias], kNullOp);
Tensor<gpu, 1, DType> gbias = in_grad[deconv::kBias].get<gpu, 1, DType>(s);
CHECK_EQ(cudnnConvolutionBackwardBias(s->dnn_handle_,
&alpha,
out_desc_,
grad.dptr_ + out_offset_ * g,
&bias_beta,
bias_desc_,
gbias.dptr_ + bias_offset_ * g),
CUDNN_STATUS_SUCCESS);
}
if (req[deconv::kWeight] != kNullOp) {
#if CUDNN_MAJOR <= 4
CHECK_EQ(cudnnConvolutionBackwardFilter_v3(s->dnn_handle_,
&alpha,
out_desc_,
grad.dptr_ + out_offset_ * g,
in_desc_,
data.dptr_ + data_offset_ * g,
conv_desc_,
back_algo_w_,
workspace.dptr_,
backward_workspace_byte_,
&weight_beta,
filter_desc_,
gwmat.dptr_ + weight_offset_ * g), CUDNN_STATUS_SUCCESS);
#elif CUDNN_MAJOR == 5
CHECK_EQ(cudnnConvolutionBackwardFilter(s->dnn_handle_,
&alpha,
out_desc_,
grad.dptr_ + out_offset_ * g,
in_desc_,
data.dptr_ + data_offset_ * g,
conv_desc_,
back_algo_w_,
workspace.dptr_,
backward_workspace_byte_,
&weight_beta,
filter_desc_,
gwmat.dptr_ + weight_offset_ * g), CUDNN_STATUS_SUCCESS);
#endif
}
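// The deconvolution data gradient is an ordinary convolution of the output
// gradient with the weights, hence cudnnConvolutionForward here.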
if (req[deconv::kData] != kNullOp) {
CHECK_EQ(cudnnConvolutionForward(s->dnn_handle_,
&alpha,
out_desc_,
grad.dptr_ + out_offset_ * g,
filter_desc_,
wmat.dptr_ + weight_offset_ * g,
conv_desc_,
algo_,
workspace.dptr_,
forward_workspace_byte_,
&data_beta,
in_desc_,
gdata.dptr_ + data_offset_ * g), CUDNN_STATUS_SUCCESS);
}
}
}
private:
inline void Init(mshadow::Stream<gpu> *s,
const std::vector<TBlob> &in_data,
const std::vector<TBlob> &out_data) {
using namespace mshadow;
#if CUDNN_MAJOR == 5
format_ = CUDNN_TENSOR_NCHW;
#endif
size_t expected = param_.no_bias ? 2 : 3;
CHECK_EQ(in_data.size(), expected);
CHECK_EQ(out_data.size(), 1U);
if (!init_cudnn_) {
init_cudnn_ = true;
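// param_.workspace was converted to words in the constructor; convert back to bytes.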
size_t workspace_byte = static_cast<size_t>(param_.workspace * sizeof(DType));
size_t back_size = 0;
size_t back_size_w = 0;
Tensor<gpu, 4, DType> data = in_data[deconv::kData].get<gpu, 4, DType>(s);
Tensor<gpu, 4, DType> out = out_data[deconv::kOut].get<gpu, 4, DType>(s);
index_t pad_y, pad_x, adj_y, adj_x;
param_.InferPad(data.size(2), data.size(3), &pad_y, &pad_x, &adj_y, &adj_x);
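// Per-group pointer offsets (in elements) used to slice the data, output,
// and weight tensors in Forward() and Backward().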
data_offset_ = data.shape_[1] / param_.num_group * data.shape_[2] * data.shape_[3];
out_offset_ = out.shape_[1] / param_.num_group * out.shape_[2] * out.shape_[3];
weight_offset_ = data.shape_[1] / param_.num_group * param_.num_filter / param_.num_group
* param_.kernel[0] * param_.kernel[1];
CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS);
CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS);
CHECK_EQ(cudnnCreateTensorDescriptor(&bias_desc_), CUDNN_STATUS_SUCCESS);
CHECK_EQ(cudnnCreateFilterDescriptor(&filter_desc_), CUDNN_STATUS_SUCCESS);
CHECK_EQ(cudnnCreateConvolutionDescriptor(&conv_desc_), CUDNN_STATUS_SUCCESS);
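// The descriptors below describe a single group's slice: channel counts are
// divided by num_group, while the tensor strides still step over the full
// ungrouped layout.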
#if CUDNN_MAJOR <= 4
CHECK_EQ(cudnnSetFilter4dDescriptor(filter_desc_,
dtype_,
data.shape_[1] / param_.num_group,
param_.num_filter / param_.num_group,
param_.kernel[0],
param_.kernel[1]), CUDNN_STATUS_SUCCESS);
#elif CUDNN_MAJOR == 5
CHECK_EQ(cudnnSetFilter4dDescriptor(filter_desc_,
dtype_,
format_,
data.shape_[1] / param_.num_group,
param_.num_filter / param_.num_group,
param_.kernel[0],
param_.kernel[1]), CUDNN_STATUS_SUCCESS);
#endif
CHECK_EQ(cudnnSetConvolution2dDescriptor(conv_desc_,
pad_y,
pad_x,
param_.stride[0],
param_.stride[1],
1,
1,
CUDNN_CROSS_CORRELATION), CUDNN_STATUS_SUCCESS);
CHECK_EQ(cudnnSetTensor4dDescriptorEx(in_desc_,
dtype_,
data.shape_[0],
data.shape_[1] / param_.num_group,
data.shape_[2],
data.shape_[3],
data.shape_[1] * data.shape_[2] * data.shape_[3],
data.shape_[2] * data.shape_[3],
data.shape_[3],
1), CUDNN_STATUS_SUCCESS);
CHECK_EQ(cudnnSetTensor4dDescriptorEx(out_desc_,
dtype_,
out.shape_[0],
out.shape_[1] / param_.num_group,
out.shape_[2],
out.shape_[3],
out.shape_[1] * out.shape_[2] * out.shape_[3],
out.shape_[2] * out.shape_[3],
out.shape_[3],
1), CUDNN_STATUS_SUCCESS);
if (!param_.no_bias) {
Tensor<gpu, 1, DType> bias = in_data[deconv::kBias].get<gpu, 1, DType>(s);
bias_offset_ = bias.shape_[0] / param_.num_group;
CHECK_EQ(cudnnSetTensor4dDescriptor(bias_desc_,
CUDNN_TENSOR_NCHW,
dtype_,
1,
bias.shape_[0] / param_.num_group,
1,
1), CUDNN_STATUS_SUCCESS);
}
CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
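// Descriptor roles are swapped relative to a plain convolution: the
// deconvolution output occupies cuDNN's input ("x") slot, so out_desc_
// leads these queries.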
CHECK_EQ(cudnnGetConvolutionForwardAlgorithm(s->dnn_handle_,
out_desc_,
filter_desc_,
conv_desc_,
in_desc_,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_byte,
&algo_), CUDNN_STATUS_SUCCESS);
CHECK_EQ(cudnnGetConvolutionBackwardFilterAlgorithm(s->dnn_handle_,
out_desc_,
in_desc_,
conv_desc_,
filter_desc_,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_byte,
&back_algo_w_), CUDNN_STATUS_SUCCESS);
CHECK_EQ(cudnnGetConvolutionBackwardDataAlgorithm(s->dnn_handle_,
filter_desc_,
in_desc_,
conv_desc_,
out_desc_,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_byte,
&back_algo_), CUDNN_STATUS_SUCCESS);
CHECK_EQ(cudnnGetConvolutionBackwardDataWorkspaceSize(s->dnn_handle_,
filter_desc_,
in_desc_,
conv_desc_,
out_desc_,
back_algo_,
&back_size), CUDNN_STATUS_SUCCESS);
CHECK_EQ(cudnnGetConvolutionBackwardFilterWorkspaceSize(s->dnn_handle_,
out_desc_,
in_desc_,
conv_desc_,
filter_desc_,
back_algo_w_,
&back_size_w), CUDNN_STATUS_SUCCESS);
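// One buffer is shared by the backward-data and backward-filter calls,
// so size it to the larger of the two requirements.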
backward_workspace_byte_ = std::max(back_size, back_size_w);
CHECK_EQ(cudnnGetConvolutionForwardWorkspaceSize(s->dnn_handle_,
out_desc_,
filter_desc_,
conv_desc_,
in_desc_,
algo_,
&forward_workspace_byte_), CUDNN_STATUS_SUCCESS);
// Forward() claims backward_workspace_byte_ for its backward-data call, and
// Backward() claims forward_workspace_byte_ for its forward call out of the
// same buffer, so both element counts must cover the largest claim.
backward_workspace_byte_ = std::max(backward_workspace_byte_, forward_workspace_byte_);
forward_workspace_ = backward_workspace_byte_ / sizeof(DType) + 1;
backward_workspace_ = backward_workspace_byte_ / sizeof(DType) + 1;
}
}
bool init_cudnn_;
size_t forward_workspace_;
size_t backward_workspace_;
size_t forward_workspace_byte_;
size_t backward_workspace_byte_;
size_t data_offset_;
size_t out_offset_;
size_t weight_offset_;
size_t bias_offset_;
cudnnDataType_t dtype_;
cudnnTensorDescriptor_t in_desc_;
cudnnTensorDescriptor_t out_desc_;
cudnnTensorDescriptor_t bias_desc_;
cudnnFilterDescriptor_t filter_desc_;
cudnnConvolutionDescriptor_t conv_desc_;
cudnnConvolutionFwdAlgo_t algo_;
cudnnConvolutionBwdDataAlgo_t back_algo_;
cudnnConvolutionBwdFilterAlgo_t back_algo_w_;
#if CUDNN_MAJOR == 5
cudnnTensorFormat_t format_;
#endif
DeconvolutionParam param_;
};
#endif // __CUDACC__ && CUDNN
} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_CUDNN_DECONVOLUTION_INL_H_