blob: 44e1fef8ba55652e059c940374491497a8fbd3f4 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "./cudnn_convolution.h"
#ifdef USE_CUDNN
#include <cudnn.h>

#include <algorithm>
#include <chrono>

#include "./cudnn_utils.h"
#include "singa/utils/logging.h"
namespace singa {
RegisterLayerClass(cudnn_convolution, CudnnConvolution);
// Release every cuDNN descriptor that InitCudnn() may have created.
// Each handle is guarded against nullptr so that a layer which was never
// initialized (InitCudnn not run, or bias_term_ == false) destructs cleanly.
CudnnConvolution::~CudnnConvolution() {
  if (x_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_desc_));
  if (y_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc_));
  if (bias_desc_ != nullptr)
    CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc_));
  if (filter_desc_ != nullptr)
    CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc_));
  if (conv_desc_ != nullptr)
    CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(conv_desc_));
}
void CudnnConvolution::Setup(const Shape& in_sample, const LayerConf &conf) {
Convolution::Setup(in_sample, conf);
ConvolutionConf conv_conf = conf.convolution_conf();
// convert MB to bytes
workspace_byte_limit_ = conv_conf.workspace_byte_limit() << 20;
prefer_ = ToLowerCase(conv_conf.prefer());
CHECK(prefer_ == "fastest" || prefer_ == "limited_workspace" ||
prefer_ == "no_workspace" || prefer_ == "autotune")
<< "CudnnConvolution only supports four algorithm preferences: fastest, "
"limited_workspace, no_workspace and autotune";
}
// Moves the layer's state to `device`: the base-class parameters
// (weight/bias, via Convolution::ToDevice) plus this layer's cuDNN
// scratch workspace tensor.
void CudnnConvolution::ToDevice(std::shared_ptr<Device> device) {
  Convolution::ToDevice(device);
  workspace_.ToDevice(device);
}
void CudnnConvolution::InitCudnn(const Tensor &input) {
DataType dtype = input.data_type();
auto dev = input.device();
Context *ctx = dev->context(0);
size_t batchsize = input.shape(0);
if (!has_init_cudnn_) {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
if (bias_term_)
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
}
CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
GetCudnnDataType(dtype), batchsize,
channels_, height_, width_));
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
num_filters_, conv_height_, conv_width_));
if (bias_term_)
CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
GetCudnnDataType(dtype), 1,
num_filters_, 1, 1));
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, pad_h_, pad_w_,
stride_h_, stride_w_, 1, 1, // dilation x and y
CUDNN_CROSS_CORRELATION
#if CUDNN_MAJOR >= 7
, GetCudnnDataType(dtype)
#endif // CUDNN_MAJOR
));
CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype),
CUDNN_TENSOR_NCHW, num_filters_,
channels_, kernel_h_, kernel_w_));
if (prefer_ == "fastest" || prefer_ == "limited_workspace" ||
prefer_ == "no_workspace") {
cudnnConvolutionFwdPreference_t fwd_pref;
cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
cudnnConvolutionBwdDataPreference_t bwd_data_pref;
if (prefer_ == "fastest") {
fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
} else if (prefer_ == "limited_workspace") {
fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
} else {
fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
}
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
workspace_byte_limit_, &fp_alg_));
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
bwd_filt_pref, workspace_byte_limit_, &bp_filter_alg_));
// deprecated in cudnn v7
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
bwd_data_pref, workspace_byte_limit_, &bp_data_alg_));
} else if (prefer_ == "autotune") {
const int topk = 1;
int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];
cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk,
&num_fp_alg, fp_alg_perf));
fp_alg_ = fp_alg_perf[0].algo;
CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk,
&num_bp_filt_alg, bp_filt_perf));
bp_filter_alg_ = bp_filt_perf[0].algo;
CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk,
&num_bp_data_alg, bp_data_perf));
bp_data_alg_ = bp_data_perf[0].algo;
} else {
LOG(FATAL) << "Preferred algorithm is not available!";
}
size_t fp_byte, bp_data_byte, bp_filter_byte;
CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_,
&fp_byte));
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
bp_data_alg_, &bp_data_byte));
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
bp_filter_alg_, &bp_filter_byte));
workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
sizeof(float) +
1;
if (workspace_count_ * sizeof(float) > workspace_byte_limit_)
LOG(WARNING) << "The required memory for workspace ("
<< workspace_count_ * sizeof(float)
<< ") is larger than the expected Bytes ("
<< workspace_byte_limit_ << ")";
workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
has_init_cudnn_ = true;
}
// Computes the forward convolution y = conv(x, W) (+ bias) via cuDNN.
//
// `input` must be a 4-D NCHW tensor on a CUDA device. When `flag` has
// kTrain set, the input is buffered so Backward() can pop it as the
// source activation (the buf_.empty() check means a training-mode
// Forward must be consumed by Backward before the next Forward).
// cuDNN state is (re-)initialized lazily, and again whenever the batch
// dimension changes; the per-sample shape (C, H, W) must never change.
//
// Returns a new tensor {batch, num_filters_, conv_height_, conv_width_}
// on the same device with the same data type.
const Tensor CudnnConvolution::Forward(int flag, const Tensor &input) {
  CHECK(buf_.empty());
  CHECK_EQ(input.device()->lang(), kCuda);
  CHECK_EQ(input.nDim(), 4u);
  if (flag & kTrain) buf_.push(input);  // buffer the input for backward
  size_t batchsize = input.shape()[0];
  DataType dtype = input.data_type();
  auto dev = input.device();
  if (!has_init_cudnn_) {
    InitCudnn(input);
  } else {
    // Descriptors already exist: read the batch size they were built
    // for and re-initialize only if it changed.
    int n, c, h, w, s;
    cudnnDataType_t type;
    CUDNN_CHECK(cudnnGetTensor4dDescriptor(x_desc_, &type, &n, &c, &h, &w,
                                           &s, &s, &s, &s));
    if (batchsize != static_cast<size_t>(n))
      InitCudnn(input);
    CHECK(input.shape(1) == static_cast<size_t>(c)
          && input.shape(2) == static_cast<size_t>(h)
          && input.shape(3) == static_cast<size_t>(w))
        << "input sample shape should not change"
        << "previous shape " << c << ", " << h << ", " << w
        << "current shape " << input.shape(1) << ", " << input.shape(2) << ", "
        << input.shape(3);
  }
  Shape shape{batchsize, num_filters_, conv_height_, conv_width_};
  Tensor output(shape, dev, dtype);
  // Enqueue the convolution kernel; the two block lists pass the
  // scheduler this op's reads ({input, weight}) and writes
  // ({output, workspace}).
  output.device()->Exec([input, output, this](Context * ctx) {
    Block *inblock = input.block(), *outblock = output.block(),
          *wblock = this->weight_.block();
    float alpha = 1.f, beta = 0.f;
    cudnnConvolutionForward(ctx->cudnn_handle, &alpha, this->x_desc_,
                            inblock->data(), this->filter_desc_, wblock->data(),
                            this->conv_desc_, this->fp_alg_,
                            this->workspace_.block()->mutable_data(),
                            this->workspace_count_ * sizeof(float), &beta,
                            this->y_desc_, outblock->mutable_data());
  }, {input.block(), weight_.block()}, {output.block(), workspace_.block()});
  if (bias_term_) {
    // beta = 1 so the broadcast bias is accumulated into the conv
    // result in place.
    output.device()->Exec([output, this](Context * ctx) {
      float beta = 1.f, alpha = 1.0f;
      Block *outblock = output.block(), *bblock = this->bias_.block();
      cudnnAddTensor(ctx->cudnn_handle, &alpha, this->bias_desc_,
                     bblock->data(), &beta, this->y_desc_,
                     outblock->mutable_data());
    }, {output.block(), bias_.block()}, {output.block()});
  }
  return output;
}
// Computes the gradients w.r.t. the input (dx), the filter weights (dw)
// and, when bias_term_ is set, the bias (db), given dL/dy in `grad`.
//
// Requires a prior training-mode Forward() whose input is popped from
// buf_ as the source activation, and initialized cuDNN state. `flag` is
// unused here. Returns {dx, {dw[, db]}} — db is appended only when the
// layer has a bias term, matching the param order used by callers.
const std::pair<Tensor, vector<Tensor>> CudnnConvolution::Backward(
    int flag, const Tensor &grad) {
  CHECK(has_init_cudnn_);
  CHECK_EQ(grad.device()->lang(), kCuda);
  CHECK_EQ(grad.nDim(), 4u);
  CHECK(!buf_.empty());
  Tensor src_data = buf_.top();  // input buffered by Forward(kTrain)
  buf_.pop();
  vector<Tensor> param_grad;
  Tensor dx;
  dx.ResetLike(src_data);
  Tensor db, dw;
  dw.ResetLike(weight_);
  // Bias gradient: cuDNN reduces dy over the N, H, W dimensions.
  if (bias_term_) {
    db.ResetLike(bias_);
    dx.device()->Exec([grad, db, this](Context * ctx) {
      Block *dyblock = grad.block(), *dbblock = db.block();
      float alpha = 1.f, beta = 0.f;
      cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, this->y_desc_,
                                   dyblock->data(), &beta, this->bias_desc_,
                                   dbblock->mutable_data());
    }, {grad.block()}, {db.block()});
  }
  // Filter gradient dw from (x, dy); reads {grad, src_data}, writes
  // {dw, workspace}.
  dx.device()->Exec([grad, dw, src_data, this](Context * ctx) {
    Block *inblock = src_data.block(), *dyblock = grad.block(),
          *dwblock = dw.block();
    float alpha = 1.f, beta = 0.f;
    cudnnConvolutionBackwardFilter(
        ctx->cudnn_handle, &alpha, this->x_desc_, inblock->data(),
        this->y_desc_, dyblock->data(), this->conv_desc_, this->bp_filter_alg_,
        this->workspace_.block()->mutable_data(),
        this->workspace_count_ * sizeof(float), &beta, this->filter_desc_,
        dwblock->mutable_data());
  }, {grad.block(), src_data.block()}, {dw.block(), workspace_.block()});
  // Input gradient dx from (W, dy); reads {grad, weight}, writes
  // {dx, workspace}.
  dx.device()->Exec([dx, grad, this](Context * ctx) {
    Block *wblock = this->weight_.block(), *dyblock = grad.block(),
          *dxblock = dx.block();
    float alpha = 1.f, beta = 0.f;
    cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, this->filter_desc_,
                                 wblock->data(), this->y_desc_, dyblock->data(),
                                 this->conv_desc_, this->bp_data_alg_,
                                 this->workspace_.block()->mutable_data(),
                                 this->workspace_count_ * sizeof(float), &beta,
                                 this->x_desc_, dxblock->mutable_data());
  }, {grad.block(), weight_.block()}, {dx.block(), workspace_.block()});
  param_grad.push_back(dw);
  if (bias_term_)
    param_grad.push_back(db);
  return std::make_pair(dx, param_grad);
}
} // namespace singa
#endif // USE_CUDNN