/*********************************************************
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
************************************************************/
#include "pooling.h"
namespace singa {
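// PoolingHandle caches the pooling geometry shared by the CPU (oneDNN) and
// GPU (cuDNN) paths. kernel_size, stride, and padding are given in
// {height, width} order, and the output spatial dims follow the usual
// formula: out = floor((in + 2 * pad - kernel) / stride) + 1.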
PoolingHandle::PoolingHandle(const Tensor &input,
const std::vector<int> &kernel_size,
const std::vector<int> &stride,
const std::vector<int> &padding,
const bool is_max) {
kernel_h = kernel_size[0];
kernel_w = kernel_size[1];
pad_h = padding[0];
pad_w = padding[1];
stride_h = stride[0];
stride_w = stride[1];
batchsize = input.shape(0);
channels = input.shape(1);
height = input.shape(2);
width = input.shape(3);
  pooled_height = 1;
  pooled_width = 1;
  // Integer division truncates, which equals floor() for the non-negative
  // operands used here, so this implements
  // out = floor((in + 2 * pad - kernel) / stride) + 1.
  if (stride_h > 0)
    pooled_height = (height + 2 * pad_h - kernel_h) / stride_h + 1;
  if (stride_w > 0)
    pooled_width = (width + 2 * pad_w - kernel_w) / stride_w + 1;
is_max_pooling = is_max;
#ifdef USE_DNNL
if (input.device()->lang() == kCpp) {
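    // Build the oneDNN memory and primitive descriptors once at construction
    // so repeated forward/backward calls can reuse them without re-deriving
    // shapes.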
auto x_dims =
dnnl::memory::dims(input.shape().begin(), input.shape().end());
auto y_dims =
dnnl::memory::dims({batchsize, channels, pooled_height, pooled_width});
auto s_dims = dnnl::memory::dims(stride.begin(), stride.end());
auto k_dims = dnnl::memory::dims(kernel_size.begin(), kernel_size.end());
auto p_dims = dnnl::memory::dims(padding.begin(), padding.end());
auto dtype_ = dnnl::memory::data_type::f32;
auto format_tag_ = get_dnnl_format_tag(input);
x_md = dnnl::memory::desc({x_dims}, dtype_, format_tag_);
y_md = dnnl::memory::desc({y_dims}, dtype_, format_tag_);
    // Select max or average pooling; average excludes padding, following the
    // cuDNN implementation convention.
auto pooling_algo = dnnl::algorithm::pooling_avg_exclude_padding;
if (is_max_pooling) pooling_algo = dnnl::algorithm::pooling_max;
auto pool_fwd_d = dnnl::pooling_forward::desc(
dnnl::prop_kind::forward_training, pooling_algo, x_md, y_md, s_dims,
k_dims, p_dims, p_dims);
auto pool_bwd_d = dnnl::pooling_backward::desc(
pooling_algo, x_md, y_md, s_dims, k_dims, p_dims, p_dims);
auto eng = input.device()->context(0)->dnnl_engine;
pool_fwd_pd = dnnl::pooling_forward::primitive_desc(pool_fwd_d, eng);
pool_bwd_pd =
dnnl::pooling_backward::primitive_desc(pool_bwd_d, eng, pool_fwd_pd);
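    // For max pooling, the forward primitive records the argmax positions in
    // this workspace and the backward primitive reads them to route
    // gradients; average pooling leaves it unused.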
auto ws_md = pool_fwd_pd.workspace_desc();
ws_mem = dnnl::memory(ws_md, eng);
}
#endif // USE_DNNL
}
PoolingHandle::~PoolingHandle() {}
#ifdef USE_DNNL
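// Runs the pre-configured oneDNN pooling primitive on a CPU tensor and
// returns the pooled output. A minimal usage sketch, assuming a 4D NCHW
// float tensor x on a kCpp device (values are illustrative):
//   PoolingHandle ph(x, /*kernel_size=*/{2, 2}, /*stride=*/{2, 2},
//                    /*padding=*/{0, 0}, /*is_max=*/true);
//   Tensor y = CpuPoolingForward(ph, x);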
Tensor CpuPoolingForward(const PoolingHandle &ph, const Tensor &x) {
CHECK_EQ(x.device()->lang(), kCpp);
  Tensor y({static_cast<size_t>(ph.batchsize), static_cast<size_t>(ph.channels),
            static_cast<size_t>(ph.pooled_height),
            static_cast<size_t>(ph.pooled_width)},
           x.device(), x.data_type());
y.device()->Exec(
[y, x, &ph](Context *ctx) mutable {
auto eng = ctx->dnnl_engine;
using namespace dnnl;
memory x_mem(ph.x_md, eng, x.block()->mutable_data());
memory y_mem(ph.y_md, eng, y.block()->mutable_data());
pooling_forward(ph.pool_fwd_pd)
.execute(ctx->dnnl_stream, {{DNNL_ARG_SRC, x_mem},
{DNNL_ARG_DST, y_mem},
{DNNL_ARG_WORKSPACE, ph.ws_mem}});
ctx->dnnl_stream.wait();
},
{x.block()}, {y.block()});
return y;
}
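// Computes the input gradient from the output gradient. The oneDNN primitive
// itself only touches grad, in_grad, and the workspace saved at forward
// time; x and y are listed as read blocks so the scheduler orders this call
// after the forward pass.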
Tensor CpuPoolingBackward(const PoolingHandle &ph, const Tensor &grad,
const Tensor &x, const Tensor &y) {
CHECK_EQ(x.device()->lang(), kCpp);
CHECK_EQ(grad.device()->lang(), kCpp);
CHECK_EQ(y.device()->lang(), kCpp);
Tensor in_grad;
in_grad.ResetLike(x);
in_grad.device()->Exec(
[x, y, in_grad, grad, &ph](Context *ctx) mutable {
auto eng = ctx->dnnl_engine;
using namespace dnnl;
memory dx_mem(ph.x_md, eng, in_grad.block()->mutable_data());
memory dy_mem(ph.y_md, eng, grad.block()->mutable_data());
pooling_backward(ph.pool_bwd_pd)
.execute(ctx->dnnl_stream, {{DNNL_ARG_DIFF_DST, dy_mem},
{DNNL_ARG_DIFF_SRC, dx_mem},
{DNNL_ARG_WORKSPACE, ph.ws_mem}});
ctx->dnnl_stream.wait();
},
{x.block(), y.block(), grad.block()}, {in_grad.block()});
return in_grad;
}
#endif // USE_DNNL
#ifdef USE_CUDNN
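// CudnnPoolingHandle extends PoolingHandle with cuDNN tensor and pooling
// descriptors; the pooling geometry itself is computed by the base
// constructor.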
CudnnPoolingHandle::CudnnPoolingHandle(const Tensor &input,
const std::vector<int> &kernel_size,
const std::vector<int> &stride,
const std::vector<int> &padding,
const bool is_max)
: PoolingHandle(input, kernel_size, stride, padding, is_max) {
  // nan_prop is a member declared in pooling.h and is used below with its
  // default value.
DataType dtype = input.data_type();
CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc));
CUDNN_CHECK(cudnnCreatePoolingDescriptor(&pool_desc));
CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW,
GetCudnnDataType(dtype), batchsize,
channels, height, width));
CUDNN_CHECK(cudnnSetTensor4dDescriptor(
y_desc, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize, channels,
pooled_height, pooled_width));
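  // Default to average pooling that excludes padding, mirroring the oneDNN
  // branch; switch to max pooling when requested.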
auto pool_method = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
if (is_max) pool_method = CUDNN_POOLING_MAX;
CUDNN_CHECK(cudnnSetPooling2dDescriptor(pool_desc, pool_method, nan_prop,
kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w));
}
CudnnPoolingHandle::~CudnnPoolingHandle() {
if (pool_desc != nullptr)
CUDNN_CHECK(cudnnDestroyPoolingDescriptor(pool_desc));
if (x_desc != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_desc));
if (y_desc != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc));
}
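// GPU counterpart of CpuPoolingForward. A minimal usage sketch, assuming a
// 4D NCHW float tensor x on a kCuda device (values are illustrative):
//   CudnnPoolingHandle cph(x, /*kernel_size=*/{2, 2}, /*stride=*/{2, 2},
//                          /*padding=*/{0, 0}, /*is_max=*/true);
//   Tensor y = GpuPoolingForward(cph, x);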
Tensor GpuPoolingForward(const CudnnPoolingHandle &cph, const Tensor &x) {
CHECK_EQ(x.device()->lang(), kCuda);
CHECK_EQ(x.nDim(), 4u);
  Tensor output(Shape{static_cast<size_t>(cph.batchsize),
                      static_cast<size_t>(cph.channels),
                      static_cast<size_t>(cph.pooled_height),
                      static_cast<size_t>(cph.pooled_width)},
                x.device(), x.data_type());
output.device()->Exec(
[output, x, &cph](Context *ctx) mutable {
float alpha = 1.0f, beta = 0.0f;
        CUDNN_CHECK(cudnnPoolingForward(
            ctx->cudnn_handle, cph.pool_desc, &alpha, cph.x_desc,
            x.block()->data(), &beta, cph.y_desc,
            output.block()->mutable_data()));
},
{x.block()}, {output.block()});
return output;
}
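// cuDNN pooling backward takes y, dy, and x in that order so that, for max
// pooling, it can re-identify which inputs produced each pooled output; the
// result dx has the shape of x.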
Tensor GpuPoolingBackward(const CudnnPoolingHandle &cph, const Tensor &dy,
const Tensor &x, const Tensor &y) {
CHECK_EQ(dy.device()->lang(), kCuda);
CHECK_EQ(dy.nDim(), 4u);
Tensor dx;
dx.ResetLike(x);
dx.device()->Exec(
[dx, dy, x, y, &cph](Context *ctx) mutable {
float alpha = 1.0f, beta = 0.0f;
        CUDNN_CHECK(cudnnPoolingBackward(
            ctx->cudnn_handle, cph.pool_desc, &alpha, cph.y_desc,
            y.block()->data(), cph.y_desc, dy.block()->data(), cph.x_desc,
            x.block()->data(), &beta, cph.x_desc,
            dx.block()->mutable_data()));
},
{dy.block(), y.block(), x.block()}, {dx.block()});
return dx;
}
#endif // USE_CUDNN
} // namespace singa