#include "./convolution.h"
#include "../layer/convolution.h"
namespace singa {
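// ConvHandle stores the convolution hyper-parameters (kernel size, stride,
// padding, channel counts, bias flag) together with the shapes derived from
// the input tensor (output height/width and the im2col buffer dimensions)
// that the CPU kernels below rely on.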
ConvHandle::ConvHandle(const Tensor &input,
const std::vector<size_t>& kernel_size,
const std::vector<size_t>& stride, const std::vector<size_t>& padding,
const size_t in_channels, const size_t out_channels,
const bool bias) {
kernel_h = kernel_size[0];
kernel_w = kernel_size[1];
pad_h = padding[0];
pad_w = padding[1];
stride_h = stride[0];
stride_w = stride[1];
channels = in_channels;
num_filters = out_channels;
bias_term = bias;
batchsize = input.shape(0);
CHECK(input.shape(1) == in_channels)
<< "The number of input channels does not match the configured value.";
height = input.shape(2);
width = input.shape(3);
conv_height = 1;
if (stride_h > 0)
conv_height = (height + 2 * pad_h - kernel_h) / stride_h + 1;
conv_width = (width + 2 * pad_w - kernel_w) / stride_w + 1;
col_height = in_channels * kernel_w * kernel_h;
col_width = conv_height * conv_width;
imagesize = input.Size() / batchsize;
}
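// CPU forward pass: each image in the batch is unrolled with Im2col into a
// (col_height x col_width) matrix, so the convolution reduces to one GEMM
// with the weight tensor reshaped to (num_filters x col_height); the optional
// bias is then broadcast across the output columns. W and b are passed by
// non-const reference because they are temporarily reshaped and restored.
//
// Usage sketch (illustrative values only, not taken from this file), assuming
// x has shape {8, 3, 32, 32}, W has shape {16, 3, 3, 3}, and b has shape {16}:
//   ConvHandle ch(x, {3, 3}, {1, 1}, {1, 1}, 3, 16, true);
//   Tensor y = CpuConvForward(x, W, b, ch);  // y has shape {8, 16, 32, 32}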
Tensor CpuConvForward(const Tensor &x, Tensor &W, Tensor &b,
const ConvHandle &ch) {
CHECK_EQ(x.device()->lang(), kCpp);
CHECK(x.shape(1) == ch.channels && x.shape(2) == ch.height &&
x.shape(3) == ch.width) << "input sample shape should not change";
CHECK(W.shape(0) == ch.num_filters && W.shape(1) == ch.channels &&
W.shape(2) == ch.kernel_h
&& W.shape(3) == ch.kernel_w) << "weights shape should not change";
Shape w_shape = W.shape();
Shape b_shape;
if (ch.bias_term)
b_shape = b.shape();
W.Reshape(Shape{ch.num_filters, ch.col_height});
if (ch.bias_term)
b.Reshape(Shape{ch.num_filters});
DataType dtype = x.data_type();
auto dev = x.device();
Shape shape{ch.batchsize, ch.num_filters, ch.conv_height, ch.conv_width};
Tensor output(shape, dev, dtype);
Tensor col_data(Shape{ch.col_height, ch.col_width});  // im2col buffer for one unrolled image
float *data_col = new float[ch.col_height * ch.col_width];
auto in_data = x.data<float>();
for (size_t num = 0; num < ch.batchsize; num++) {
Im2col(in_data + num * ch.imagesize, ch.channels, ch.height, ch.width,
ch.kernel_h, ch.kernel_w, ch.pad_h, ch.pad_w, ch.stride_h, ch.stride_w,
data_col);
col_data.CopyDataFromHostPtr(data_col, ch.col_height * ch.col_width);
Tensor each = Mult(W, col_data);
if (ch.bias_term) {
AddColumn(b, &each);
}
CopyDataToFrom(&output, each, each.Size(), num * each.Size());
}
delete[] data_col;  // release the im2col scratch buffer
W.Reshape(w_shape);
if (ch.bias_term)
b.Reshape(b_shape);
return output;
}
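// Gradient w.r.t. the input: for each sample, the column gradient is
// dcol = W^T * dy_n, and Col2im folds it back into the image layout of x.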
Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x,
const ConvHandle &ch) {
CHECK_EQ(dy.device()->lang(), kCpp);
CHECK(dy.shape(1) == ch.num_filters && dy.shape(2) == ch.conv_height &&
dy.shape(3) == ch.conv_width) << "input gradients shape should not change";
CHECK(W.shape(0) == ch.num_filters && W.shape(1) == ch.channels &&
W.shape(2) == ch.kernel_h
&& W.shape(3) == ch.kernel_w) << "weights shape should not change";
Shape w_shape = W.shape();
W.Reshape(Shape{ch.num_filters, ch.col_height});
Tensor dx;
dx.ResetLike(x);
float *dx_b = new float[ch.imagesize];
for (size_t num = 0; num < ch.batchsize; num++) {
Tensor grad_b(Shape{ch.num_filters, ch.conv_height * ch.conv_width});
CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
Tensor dcol_b = Mult(Transpose(W), grad_b);
auto dcol_data = dcol_b.data<float>();
Col2im(dcol_data, ch.channels, ch.height, ch.width, ch.kernel_h, ch.kernel_w,
ch.pad_h, ch.pad_w, ch.stride_h, ch.stride_w, dx_b);
dx.CopyDataFromHostPtr(dx_b, ch.imagesize, num * ch.imagesize);
}
delete[] dx_b;  // release the per-sample gradient buffer
W.Reshape(w_shape);
return dx;
}
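// Gradient w.r.t. the weights: dW accumulates dy_n * col(x_n)^T over the
// batch, where col(x_n) is the im2col expansion of input sample n.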
Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W,
const ConvHandle &ch) {
CHECK_EQ(dy.device()->lang(), kCpp);
CHECK(dy.shape(1) == ch.num_filters && dy.shape(2) == ch.conv_height &&
dy.shape(3) == ch.conv_width) << "input gradients shape should not change";
CHECK(x.shape(1) == ch.channels && x.shape(2) == ch.height &&
x.shape(3) == ch.width) << "input sample shape should not change";
Tensor dW;
dW.ResetLike(W);
dW.SetValue(0.0f);
Shape w_shape = W.shape();
dW.Reshape(Shape{ch.num_filters, ch.col_height});
Tensor col_data(Shape{ch.col_height, ch.col_width});  // im2col buffer for one unrolled image
float *data_col = new float[ch.col_height * ch.col_width];
auto in_data = x.data<float>();  // Im2col unrolls the input x, not the gradient dy
for (size_t num = 0; num < ch.batchsize; num++) {
Im2col(in_data + num * ch.imagesize, ch.channels, ch.height, ch.width,
ch.kernel_h, ch.kernel_w, ch.pad_h, ch.pad_w, ch.stride_h, ch.stride_w,
data_col);
col_data.CopyDataFromHostPtr(data_col, ch.col_height * ch.col_width);
Tensor grad_b(Shape{ch.num_filters, ch.conv_height * ch.conv_width});
CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size());
dW += Mult(grad_b, Transpose(col_data));
}
delete[] data_col;  // release the im2col scratch buffer
dW.Reshape(w_shape);
return dW;
}
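// Gradient w.r.t. the bias: dy is reduced over the spatial dimensions and
// then over the batch, leaving one scalar per output channel.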
Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b,
const ConvHandle &ch) {
CHECK_EQ(dy.device()->lang(), kCpp);
CHECK(dy.shape(1) == ch.num_filters && dy.shape(2) == ch.conv_height &&
dy.shape(3) == ch.conv_width) << "input gradients shape should not change";
CHECK(b.shape(0) == ch.num_filters) << "bias shape should not change";
Tensor db;
db.ResetLike(b);
auto tmpshp = Shape{ch.batchsize * ch.num_filters,
dy.Size() / (ch.batchsize * ch.num_filters)};
Tensor tmp1 = Reshape(dy, tmpshp);
Tensor tmp2(Shape{ch.batchsize * ch.num_filters});
SumColumns(tmp1, &tmp2);
Tensor tmp3 = Reshape(tmp2, Shape{ch.batchsize, ch.num_filters});
SumRows(tmp3, &db);
return db;
}
#ifdef USE_CUDNN
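// CudnnConvHandle extends ConvHandle with the cuDNN tensor/filter/convolution
// descriptors, the selected forward/backward algorithms, and a shared
// workspace tensor. Algorithms are chosen either from a preference string
// ("fastest", "limited_workspace", "no_workspace") via the cudnnGet* APIs, or
// by benchmarking with the cudnnFind* APIs when prefer == "autotune".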
CudnnConvHandle::CudnnConvHandle(const Tensor &input,
const std::vector<size_t>& kernel_size,
const std::vector<size_t>& stride, const std::vector<size_t>& padding,
const size_t in_channels, const size_t out_channels, const bool bias,
const size_t groups,
const size_t workspace_byte_limit, const std::string& prefer)
: ConvHandle(input, kernel_size, stride, padding, in_channels, out_channels,
bias) {
DataType dtype = input.data_type();
auto dev = input.device();
Context *ctx = dev->context(0);
CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc));
if (bias_term)
CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc));
CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc));
CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc));
CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW,
GetCudnnDataType(dtype), batchsize,
channels, height, width));
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
y_desc, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
num_filters, conv_height, conv_width));
if (bias_term)
CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc, CUDNN_TENSOR_NCHW,
GetCudnnDataType(dtype), 1,
num_filters, 1, 1));
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc, pad_h, pad_w,
stride_h, stride_w, 1, 1,
CUDNN_CROSS_CORRELATION
#if CUDNN_MAJOR >= 7
, GetCudnnDataType(dtype)
#endif
));
#if CUDNN_MAJOR >= 7
if (groups > 1) {
CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, groups));
}
#else
if (groups > 1) {
LOG(FATAL) << "The installed cuDNN version does not support grouped convolution.";
}
#endif
CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc, GetCudnnDataType(dtype),
CUDNN_TENSOR_NCHW, num_filters,
channels / groups, kernel_h, kernel_w));
if (prefer == "fastest" || prefer == "limited_workspace" ||
prefer == "no_workspace") {
cudnnConvolutionFwdPreference_t fwd_pref;
cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
cudnnConvolutionBwdDataPreference_t bwd_data_pref;
if (prefer == "fastest") {
fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
} else if (prefer == "limited_workspace") {
fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
} else {
fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
}
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
ctx->cudnn_handle, x_desc, filter_desc, conv_desc, y_desc, fwd_pref,
workspace_byte_limit, &fp_alg));
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
ctx->cudnn_handle, x_desc, y_desc, conv_desc, filter_desc,
bwd_filt_pref, workspace_byte_limit, &bp_filter_alg));
// Note: these preference-based cudnnGet*Algorithm APIs are deprecated in newer cuDNN releases.
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
ctx->cudnn_handle, filter_desc, y_desc, conv_desc, x_desc,
bwd_data_pref, workspace_byte_limit, &bp_data_alg));
} else if (prefer == "autotune") {
const int topk = 1;
int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
cudnnConvolutionFwdAlgoPerf_t fp_algperf[topk];
cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
ctx->cudnn_handle, x_desc, filter_desc, conv_desc, y_desc, topk,
&num_fp_alg, fp_algperf));
fp_alg = fp_algperf[0].algo;
CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
ctx->cudnn_handle, x_desc, y_desc, conv_desc, filter_desc, topk,
&num_bp_filt_alg, bp_filt_perf));
bp_filter_alg = bp_filt_perf[0].algo;
CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
ctx->cudnn_handle, filter_desc, y_desc, conv_desc, x_desc, topk,
&num_bp_data_alg, bp_data_perf));
bp_data_alg = bp_data_perf[0].algo;
} else {
LOG(FATAL) << "Preferred algorithm is not available!";
}
size_t fp_byte, bp_data_byte, bp_filter_byte;
CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
ctx->cudnn_handle, x_desc, filter_desc, conv_desc, y_desc, fp_alg,
&fp_byte));
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
ctx->cudnn_handle, filter_desc, y_desc, conv_desc, x_desc,
bp_data_alg, &bp_data_byte));
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
ctx->cudnn_handle, x_desc, y_desc, conv_desc, filter_desc,
bp_filter_alg, &bp_filter_byte));
workspace_count =
std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) / sizeof(float) + 1;
if (workspace_count * sizeof(float) > workspace_byte_limit)
LOG(WARNING) << "The required workspace memory ("
<< workspace_count * sizeof(float)
<< " bytes) exceeds the configured limit ("
<< workspace_byte_limit << " bytes)";
workspace = Tensor(Shape{workspace_count}, dev, dtype);
}
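// Release the cuDNN descriptors created in the constructor.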
CudnnConvHandle::~CudnnConvHandle() {
if (bias_desc != nullptr)
CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc));
if (filter_desc != nullptr)
CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc));
if (conv_desc != nullptr)
CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(conv_desc));
if (x_desc != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_desc));
if (y_desc != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc));
}
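// GPU forward pass: a single cudnnConvolutionForward call using the
// pre-selected algorithm and the shared workspace, followed by cudnnAddTensor
// to broadcast the bias when bias_term is set.
//
// Usage sketch (illustrative only; assumes x, W, and b already live on a CUDA
// device with shapes matching the handle, and spells out the trailing
// constructor arguments for groups, workspace limit, and preference):
//   CudnnConvHandle cch(x, {3, 3}, {1, 1}, {1, 1}, 3, 16, true,
//                       1, 1024 * 1024 * 1024, "fastest");
//   Tensor y = GpuConvForward(x, W, b, cch);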
Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b,
const CudnnConvHandle &cch) {
CHECK_EQ(x.device()->lang(), kCuda);
DataType dtype = x.data_type();
auto dev = x.device();
Shape shape{cch.batchsize, cch.num_filters, cch.conv_height, cch.conv_width};
Tensor output(shape, dev, dtype);
output.device()->Exec([&output, &x, &W, &cch](Context * ctx) {
Block *inblock = x.block(), *outblock = output.block(),
*wblock = W.block();
float alpha = 1.f, beta = 0.f;
cudnnConvolutionForward(ctx->cudnn_handle, &alpha, cch.x_desc,
inblock->data(), cch.filter_desc, wblock->data(),
cch.conv_desc, cch.fp_alg,
cch.workspace.block()->mutable_data(),
cch.workspace_count * sizeof(float), &beta,
cch.y_desc, outblock->mutable_data());
}, {x.block(), W.block()}, {output.block(), cch.workspace.block()});
if (cch.bias_term) {
output.device()->Exec([&output, &b, &cch](Context * ctx) {
float beta = 1.f, alpha = 1.0f;
Block *outblock = output.block(), *bblock = b.block();
cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc,
bblock->data(), &beta, cch.y_desc,
outblock->mutable_data());
}, {output.block(), b.block()}, {output.block()});
}
return output;
}
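// Gradient w.r.t. the input via cudnnConvolutionBackwardData.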
Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x,
const CudnnConvHandle &cch) {
CHECK_EQ(dy.device()->lang(), kCuda);
Tensor dx;
dx.ResetLike(x);
dy.device()->Exec([&dx, &dy, &W, &cch](Context * ctx) {
Block *wblock = W.block(), *dyblock = dy.block(),
*dxblock = dx.block();
float alpha = 1.f, beta = 0.f;
cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, cch.filter_desc,
wblock->data(), cch.y_desc, dyblock->data(),
cch.conv_desc, cch.bp_data_alg,
cch.workspace.block()->mutable_data(),
cch.workspace_count * sizeof(float), &beta,
cch.x_desc, dxblock->mutable_data());
}, {dy.block(), W.block()}, {dx.block(), cch.workspace.block()});
return dx;
}
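// Gradient w.r.t. the weights via cudnnConvolutionBackwardFilter.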
Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W,
const CudnnConvHandle &cch) {
CHECK_EQ(dy.device()->lang(), kCuda);
Tensor dW;
dW.ResetLike(W);
dy.device()->Exec([&dW, &dy, &x, &cch](Context * ctx) {
Block *inblock = x.block(), *dyblock = dy.block(),
*dwblock = dW.block();
float alpha = 1.f, beta = 0.f;
cudnnConvolutionBackwardFilter(
ctx->cudnn_handle, &alpha, cch.x_desc, inblock->data(),
cch.y_desc, dyblock->data(), cch.conv_desc, cch.bp_filter_alg,
cch.workspace.block()->mutable_data(),
cch.workspace_count * sizeof(float), &beta, cch.filter_desc,
dwblock->mutable_data());
}, {dy.block(), x.block()}, {dW.block(), cch.workspace.block()});
return dW;
}
// The bias tensor b is only used to infer the shape of db (via ResetLike);
// this dependency could be removed later.
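// Gradient w.r.t. the bias via cudnnConvolutionBackwardBias, which sums dy
// over the batch and spatial dimensions.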
Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b,
const CudnnConvHandle &cch) {
CHECK_EQ(dy.device()->lang(), kCuda);
Tensor db;
db.ResetLike(b);
dy.device()->Exec([&db, &dy, &cch](Context * ctx) {
Block *dyblock = dy.block(), *dbblock = db.block();
float alpha = 1.f, beta = 0.f;
cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, cch.y_desc,
dyblock->data(), &beta, cch.bias_desc,
dbblock->mutable_data());
}, {dy.block()}, {db.block()});
return db;
}
#endif // USE_CUDNN
} // namespace singa