blob: af44df9069dd66f28bfc9b83a950318c4fcb9ad2 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file pad.cu
* \brief
* \author Sebastian Bodenstein
*/
#include <algorithm>
#include "./pad-inl.h"
#include "../common/cuda/utils.h"
namespace mshadow {
namespace cuda {
////////////////////////////////////////////////////////////////////////////////
// Special Case: 2d image (so only pad width + height)
// Case 1: Replication Padding
// single_image_2d_edge adapted from Torch
// https://github.com/torch/cunn/blob/master/lib/THCUNN/SpatialReplicationPadding.cu
// Edge ("replication") padding, forward, 2-d (NCHW) case.
// One thread per dst pixel: blockIdx.x * blockDim.x covers the flattened
// H*W output plane, blockIdx.y selects the channel, blockIdx.z the batch.
// Adapted from Torch's SpatialReplicationPadding.cu.
template <int n_bits, typename DType>
__global__ void image_2d_pad_edge_kernel(Tensor<gpu, 4, DType> dst,
                                         const Tensor<gpu, 4, DType> src,
                                         const int padT,
                                         const int padL) {
  int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
  int plane = blockIdx.y;  // channel index
  int batch = blockIdx.z;  // batch index
  // Guard the grid tail: the last block may run past the H*W plane.
  if (outputPointId >= dst.size(2) * dst.size(3)) {
    return;
  }
  int outputPointX = outputPointId % dst.size(3);
  int outputPointY = outputPointId / dst.size(3);
  // For non-negative pads iStart* = 0 and oStart* = pad*; the max(0, ...)
  // forms generalize the mapping should a negative pad ever be passed.
  int iStartX = max(0, -padL);
  int iStartY = max(0, -padT);
  int oStartX = max(0, padL);
  int oStartY = max(0, padT);
  // Clamp the output coordinate into the valid input range: pixels in the
  // padded border replicate the nearest edge pixel of src.
  int inputPointX =
      min(max(padL, outputPointX), static_cast<int>(src.size(3)) + padL - 1) - oStartX + iStartX;
  int inputPointY =
      min(max(padT, outputPointY), static_cast<int>(src.size(2)) + padT - 1) - oStartY + iStartY;
  DType valueToCopy = src[batch][plane][inputPointY][inputPointX];
  dst[batch][plane][outputPointY][outputPointX] = valueToCopy;
}
// Host launcher for 2-d edge (replication) padding.
// Grid layout: x covers the flattened H*W output plane, y = channels,
// z = batch samples.
template <typename DType>
inline void image_pad_edge(Tensor<gpu, 4, DType> dst,
                           const Tensor<gpu, 4, DType>& src,
                           const mxnet::TShape& pad) {
  // Leading pads along H (pad[4]) and W (pad[6]); the trailing pads are
  // implied by dst being larger than src.
  const int padT = pad[4];
  const int padL = pad[6];
  dim3 dimBlock(kBaseThreadNum);
  // Ceil-divide by the block size actually launched (kBaseThreadNum) rather
  // than a hard-coded 256, so grid and block can never fall out of sync.
  int xGridSize = (dst.size(2) * dst.size(3) + kBaseThreadNum - 1) / kBaseThreadNum;
  dim3 dimGrid(xGridSize, dst.size(1), dst.size(0));
  CheckLaunchParam(dimGrid, dimBlock, "Pad");
  cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
  image_2d_pad_edge_kernel<kBaseThreadBits, DType>
      <<<dimGrid, dimBlock, 0, stream>>>(dst, src, padT, padL);
  MSHADOW_CUDA_POST_KERNEL_CHECK(image_2d_pad_edge_kernel);
}
// Edge ("replication") padding, backward, 2-d (NCHW) case.
// One thread per grad_out pixel; blockIdx.y = channel, blockIdx.z = batch.
// Every border pixel clamped to the same source pixel in the forward pass
// must send its gradient there, so several threads may target the same
// grad_in element; atomicAdd makes the accumulation race-free (grad_in is
// added into, not overwritten).
template <int n_bits, typename DType>
__global__ void image_2d_pad_edge_grad_kernel(Tensor<gpu, 4, DType> grad_in,
                                              const Tensor<gpu, 4, DType> grad_out,
                                              const int padT,
                                              const int padL) {
  int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
  int plane = blockIdx.y;  // channel index
  int batch = blockIdx.z;  // batch index
  // Guard the grid tail beyond the flattened H*W plane.
  if (outputPointId >= grad_out.size(2) * grad_out.size(3)) {
    return;
  }
  int outputPointX = outputPointId % grad_out.size(3);
  int outputPointY = outputPointId / grad_out.size(3);
  // For non-negative pads iStart* = 0 and oStart* = pad*.
  int iStartX = max(0, -padL);
  int iStartY = max(0, -padT);
  int oStartX = max(0, padL);
  int oStartY = max(0, padT);
  // Same clamping map as the forward kernel: border coordinates collapse
  // onto the nearest valid input coordinate.
  int inputPointX = min(max(padL, outputPointX), static_cast<int>(grad_in.size(3)) + padL - 1) -
                    oStartX + iStartX;
  int inputPointY = min(max(padT, outputPointY), static_cast<int>(grad_in.size(2)) + padT - 1) -
                    oStartY + iStartY;
  DType valueToCopy = grad_out[batch][plane][outputPointY][outputPointX];
  atomicAdd(&grad_in[batch][plane][inputPointY][inputPointX], valueToCopy);
}
// Host launcher for the 2-d edge-padding backward pass.
// Grid layout: x covers the flattened H*W grad_out plane, y = channels,
// z = batch samples.
template <typename DType>
inline void image_pad_edge_grad(Tensor<gpu, 4, DType> grad_in,
                                const Tensor<gpu, 4, DType>& grad_out,
                                const mxnet::TShape& pad) {
  // Leading pads along H (pad[4]) and W (pad[6]).
  const int padT = pad[4];
  const int padL = pad[6];
  dim3 dimBlock(kBaseThreadNum);
  // Use kBaseThreadNum (not a literal 256) so the ceil-divide always matches
  // the launched block size.
  int xGridSize = (grad_out.size(2) * grad_out.size(3) + kBaseThreadNum - 1) / kBaseThreadNum;
  dim3 dimGrid(xGridSize, grad_out.size(1), grad_out.size(0));
  CheckLaunchParam(dimGrid, dimBlock, "Pad");
  cudaStream_t stream = Stream<gpu>::GetStream(grad_out.stream_);
  image_2d_pad_edge_grad_kernel<kBaseThreadBits, DType>
      <<<dimGrid, dimBlock, 0, stream>>>(grad_in, grad_out, padT, padL);
  MSHADOW_CUDA_POST_KERNEL_CHECK(image_2d_pad_edge_grad_kernel);
}
// Case 2: Constant Padding
// Constant padding, forward, 2-d (NCHW) case.
// One thread per dst pixel; blockIdx.y = channel, blockIdx.z = batch.
// Branch-free: every thread does a clamped (always in-bounds) src read and a
// border test, then arithmetically selects the src value or `constant`.
template <int n_bits, typename DType>
__global__ void image_2d_pad_constant_kernel(Tensor<gpu, 4, DType> dst,
                                             const Tensor<gpu, 4, DType> src,
                                             const int padT,
                                             const int padL,
                                             const DType constant) {
  int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
  // Guard the grid tail beyond the flattened H*W plane.
  if (outputPointId >= dst.size(2) * dst.size(3)) {
    return;
  }
  // cast sizes to int to use in min/max
  int Ny = src.size(2);
  int Nx = src.size(3);
  int plane = blockIdx.y;  // channel index
  int batch = blockIdx.z;  // batch index
  int outputPointX = outputPointId % dst.size(3);
  int outputPointY = outputPointId / dst.size(3);
  // Each check* is > 0 exactly when the point is NOT inside that side's pad
  // band, so the product below is nonzero only for interior points.
  int checkT = max(0, outputPointY - padT + 1);
  int checkB = max(0, padT + Ny - outputPointY);
  int checkL = max(0, outputPointX - padL + 1);
  int checkR = max(0, padL + Nx - outputPointX);
  // Clamp so the speculative src read is in bounds even for border threads
  // whose loaded value is discarded by the select below.
  int inputPointX = min(max(outputPointX - padL, 0), Nx - 1);
  int inputPointY = min(max(outputPointY - padT, 0), Ny - 1);
  // 1 if need padding, 0 if not
  int need_pad = !(checkT * checkB * checkL * checkR);
  DType valueToCopy = src[batch][plane][inputPointY][inputPointX];
  dst[batch][plane][outputPointY][outputPointX] = valueToCopy * (!need_pad) + need_pad * constant;
}
// Host launcher for 2-d constant padding.
// Grid layout: x covers the flattened H*W output plane, y = channels,
// z = batch samples.
template <typename DType>
inline void image_pad_constant(Tensor<gpu, 4, DType> dst,
                               const Tensor<gpu, 4, DType>& src,
                               const mxnet::TShape& pad,
                               const DType constant) {
  // Leading pads along H (pad[4]) and W (pad[6]).
  const int padT = pad[4];
  const int padL = pad[6];
  dim3 dimBlock(kBaseThreadNum);
  // Ceil-divide by kBaseThreadNum (the real block size) instead of a
  // hard-coded 256 to keep grid and block in lock-step.
  int xGridSize = (dst.size(2) * dst.size(3) + kBaseThreadNum - 1) / kBaseThreadNum;
  dim3 dimGrid(xGridSize, dst.size(1), dst.size(0));
  CheckLaunchParam(dimGrid, dimBlock, "Pad");
  cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
  image_2d_pad_constant_kernel<kBaseThreadBits, DType>
      <<<dimGrid, dimBlock, 0, stream>>>(dst, src, padT, padL, constant);
  MSHADOW_CUDA_POST_KERNEL_CHECK(image_2d_pad_constant_kernel);
}
// Constant padding, backward, 2-d (NCHW) case.
// The padded border contributes no gradient, so the backward pass simply
// copies the interior H*W window of grad_out into grad_in, shifted by the
// leading (top, left) pad offsets. One thread per grad_in pixel;
// blockIdx.y = channel, blockIdx.z = batch.
template <int n_bits, typename DType>
__global__ void image_2d_pad_constant_grad_kernel(Tensor<gpu, 4, DType> grad_in,
                                                  const Tensor<gpu, 4, DType> grad_out,
                                                  const int padT,
                                                  const int padL) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  const int height = grad_in.size(2);
  const int width = grad_in.size(3);
  // Guard the grid tail beyond the flattened H*W plane.
  if (tid >= height * width) {
    return;
  }
  const int c = blockIdx.y;  // channel (plane)
  const int n = blockIdx.z;  // batch sample
  const int x = tid % width;
  const int y = tid / width;
  grad_in[n][c][y][x] = grad_out[n][c][y + padT][x + padL];
}
// Host launcher for the 2-d constant-padding backward pass.
// Grid layout: x covers the flattened H*W grad_in plane, y = channels,
// z = batch samples.
template <typename DType>
inline void image_pad_constant_grad(Tensor<gpu, 4, DType> grad_in,
                                    const Tensor<gpu, 4, DType>& grad_out,
                                    const mxnet::TShape& pad) {
  // Leading pads along H (pad[4]) and W (pad[6]).
  const int padT = pad[4];
  const int padL = pad[6];
  dim3 dimBlock(kBaseThreadNum);
  // Use kBaseThreadNum (not a literal 256) so grid sizing always matches
  // the launched block size.
  int xGridSize = (grad_in.size(2) * grad_in.size(3) + kBaseThreadNum - 1) / kBaseThreadNum;
  dim3 dimGrid(xGridSize, grad_in.size(1), grad_in.size(0));
  CheckLaunchParam(dimGrid, dimBlock, "Pad");
  cudaStream_t stream = Stream<gpu>::GetStream(grad_in.stream_);
  image_2d_pad_constant_grad_kernel<kBaseThreadBits, DType>
      <<<dimGrid, dimBlock, 0, stream>>>(grad_in, grad_out, padT, padL);
  MSHADOW_CUDA_POST_KERNEL_CHECK(image_2d_pad_constant_grad_kernel);
}
// Case 3: Reflection Padding
// adapted from Torch
// https://github.com/torch/cunn/blob/master/lib/THCUNN/SpatialReflectionPadding.cu
// Reflection padding, forward, 2-d (NCHW) case; adapted from Torch's
// SpatialReflectionPadding.cu.
// One thread per dst pixel; blockIdx.y = channel, blockIdx.z = batch.
// NOTE(review): this is a single-bounce reflection -- the computed index
// leaves [0, size) if a pad exceeds the input extent; presumably shape
// checks upstream rule that out. TODO confirm.
template <int n_bits, typename DType>
__global__ void image_2d_pad_reflect_kernel(Tensor<gpu, 4, DType> dst,
                                            const Tensor<gpu, 4, DType> src,
                                            const int padT,
                                            const int padL) {
  int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
  int plane = blockIdx.y;  // channel index
  int batch = blockIdx.z;  // batch index
  // Guard the grid tail beyond the flattened H*W plane.
  if (outputPointId >= dst.size(2) * dst.size(3)) {
    return;
  }
  int outputPointX = outputPointId % dst.size(3);
  int outputPointY = outputPointId / dst.size(3);
  // For non-negative pads iStart* = 0 and oStart* = pad*.
  int iStartX = max(0, -padL);
  int iStartY = max(0, -padT);
  int oStartX = max(0, padL);
  int oStartY = max(0, padT);
  // __sad(a, b, 0) == |a - b|.  The branch-free expression reduces to the
  // identity (input = output - pad) for interior points and to the mirrored
  // index for points inside the pad bands.
  int inputPointX = __sad(outputPointX, padL, 0) - __sad(outputPointX, src.size(3) + padL - 1, 0) -
                    outputPointX + 2 * padL + src.size(3) - 1 - oStartX + iStartX;
  int inputPointY = __sad(outputPointY, padT, 0) - __sad(outputPointY, src.size(2) + padT - 1, 0) -
                    outputPointY + 2 * padT + src.size(2) - 1 - oStartY + iStartY;
  DType valueToCopy = src[batch][plane][inputPointY][inputPointX];
  dst[batch][plane][outputPointY][outputPointX] = valueToCopy;
}
// Host launcher for 2-d reflection padding.
// Grid layout: x covers the flattened H*W output plane, y = channels,
// z = batch samples.
template <typename DType>
inline void image_pad_reflect(Tensor<gpu, 4, DType> dst,
                              const Tensor<gpu, 4, DType>& src,
                              const mxnet::TShape& pad) {
  // Leading pads along H (pad[4]) and W (pad[6]).
  const int padT = pad[4];
  const int padL = pad[6];
  dim3 dimBlock(kBaseThreadNum);
  // Ceil-divide by kBaseThreadNum rather than a hard-coded 256 so the grid
  // always covers the plane for the launched block size.
  int xGridSize = (dst.size(2) * dst.size(3) + kBaseThreadNum - 1) / kBaseThreadNum;
  dim3 dimGrid(xGridSize, dst.size(1), dst.size(0));
  CheckLaunchParam(dimGrid, dimBlock, "Pad");
  cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
  image_2d_pad_reflect_kernel<kBaseThreadBits, DType>
      <<<dimGrid, dimBlock, 0, stream>>>(dst, src, padT, padL);
  MSHADOW_CUDA_POST_KERNEL_CHECK(image_2d_pad_reflect_kernel);
}
// Reflection padding, backward, 2-d (NCHW) case.
// One thread per grad_out pixel; blockIdx.y = channel, blockIdx.z = batch.
// A mirrored border pixel and its interior source map to the same grad_in
// element, so threads can collide; atomicAdd makes the accumulation
// race-free (grad_in is added into, not overwritten).
template <int n_bits, typename DType>
__global__ void image_2d_pad_reflect_grad_kernel(Tensor<gpu, 4, DType> grad_in,
                                                 const Tensor<gpu, 4, DType> grad_out,
                                                 const int padT,
                                                 const int padL) {
  int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
  int plane = blockIdx.y;  // channel index
  int batch = blockIdx.z;  // batch index
  // Guard the grid tail beyond the flattened H*W plane.
  if (outputPointId >= grad_out.size(2) * grad_out.size(3)) {
    return;
  }
  int outputPointX = outputPointId % grad_out.size(3);
  int outputPointY = outputPointId / grad_out.size(3);
  // For non-negative pads iStart* = 0 and oStart* = pad*.
  int iStartX = max(0, -padL);
  int iStartY = max(0, -padT);
  int oStartX = max(0, padL);
  int oStartY = max(0, padT);
  // Same __sad-based mirror map as the forward kernel (|a - b| == __sad(a, b, 0)):
  // identity for interior points, reflected index inside the pad bands.
  int inputPointX = __sad(outputPointX, padL, 0) -
                    __sad(outputPointX, grad_in.size(3) + padL - 1, 0) - outputPointX + 2 * padL +
                    grad_in.size(3) - 1 - oStartX + iStartX;
  int inputPointY = __sad(outputPointY, padT, 0) -
                    __sad(outputPointY, grad_in.size(2) + padT - 1, 0) - outputPointY + 2 * padT +
                    grad_in.size(2) - 1 - oStartY + iStartY;
  DType valueToCopy = grad_out[batch][plane][outputPointY][outputPointX];
  atomicAdd(&grad_in[batch][plane][inputPointY][inputPointX], valueToCopy);
}
// Host launcher for the 2-d reflection-padding backward pass.
// Grid layout: x covers the flattened H*W grad_out plane, y = channels,
// z = batch samples.
template <typename DType>
inline void image_pad_reflect_grad(Tensor<gpu, 4, DType> grad_in,
                                   const Tensor<gpu, 4, DType>& grad_out,
                                   const mxnet::TShape& pad) {
  // Leading pads along H (pad[4]) and W (pad[6]).
  const int padT = pad[4];
  const int padL = pad[6];
  dim3 dimBlock(kBaseThreadNum);
  // Use kBaseThreadNum (not a literal 256) to keep grid and block sizes
  // consistent by construction.
  int xGridSize = (grad_out.size(2) * grad_out.size(3) + kBaseThreadNum - 1) / kBaseThreadNum;
  dim3 dimGrid(xGridSize, grad_out.size(1), grad_out.size(0));
  CheckLaunchParam(dimGrid, dimBlock, "Pad");
  cudaStream_t stream = Stream<gpu>::GetStream(grad_out.stream_);
  image_2d_pad_reflect_grad_kernel<kBaseThreadBits, DType>
      <<<dimGrid, dimBlock, 0, stream>>>(grad_in, grad_out, padT, padL);
  MSHADOW_CUDA_POST_KERNEL_CHECK(image_2d_pad_reflect_grad_kernel);
}
////////////////////////////////////////////////////////////////////////////////
// Special Case: 3d image (pad depth + width + height)
// Case 1: Replication Padding
// single_image_3_edge adapted from Torch
// https://github.com/torch/cunn/blob/master/lib/THCUNN/VolumetricReplicationPadding.cu
// Edge ("replication") padding, forward, 3-d (NCDHW) case.
// One thread per dst voxel: blockIdx.x * blockDim.x covers the flattened
// D*H*W volume, blockIdx.y = channel, blockIdx.z = batch.
// Adapted from Torch's VolumetricReplicationPadding.cu.
template <int n_bits, typename DType>
__global__ void image_3d_pad_edge_kernel(Tensor<gpu, 5, DType> dst,
                                         const Tensor<gpu, 5, DType> src,
                                         const int padF,
                                         const int padT,
                                         const int padL) {
  int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
  int plane = blockIdx.y;  // channel index
  int batch = blockIdx.z;  // batch index
  // Guard the grid tail beyond the flattened D*H*W volume.
  if (outputPointId >= dst.size(2) * dst.size(3) * dst.size(4)) {
    return;
  }
  int outputPointX = outputPointId % dst.size(4);
  int outputPointY = (outputPointId / dst.size(4)) % dst.size(3);
  int outputPointZ = outputPointId / (dst.size(3) * dst.size(4));
  // For non-negative pads iStart* = 0 and oStart* = pad*.
  int iStartX = max(0, -padL);
  int iStartY = max(0, -padT);
  int iStartZ = max(0, -padF);
  int oStartX = max(0, padL);
  int oStartY = max(0, padT);
  int oStartZ = max(0, padF);
  // Clamp each output coordinate into the valid input range: voxels in the
  // padded border replicate the nearest edge voxel of src.
  int inputPointX =
      min(max(padL, outputPointX), static_cast<int>(src.size(4)) + padL - 1) - oStartX + iStartX;
  int inputPointY =
      min(max(padT, outputPointY), static_cast<int>(src.size(3)) + padT - 1) - oStartY + iStartY;
  int inputPointZ =
      min(max(padF, outputPointZ), static_cast<int>(src.size(2)) + padF - 1) - oStartZ + iStartZ;
  DType valueToCopy = src[batch][plane][inputPointZ][inputPointY][inputPointX];
  dst[batch][plane][outputPointZ][outputPointY][outputPointX] = valueToCopy;
}
// Host launcher for 3-d edge (replication) padding.
// Grid layout: x covers the flattened D*H*W output volume, y = channels,
// z = batch samples.
template <typename DType>
inline void image_pad_edge(Tensor<gpu, 5, DType> dst,
                           const Tensor<gpu, 5, DType>& src,
                           const mxnet::TShape& pad) {
  // Leading pads along D (pad[4]), H (pad[6]) and W (pad[8]).
  const int padF = pad[4];
  const int padT = pad[6];
  const int padL = pad[8];
  dim3 dimBlock(kBaseThreadNum);
  // Ceil-divide by kBaseThreadNum (the real block size) rather than a
  // hard-coded 256 so grid and block stay in sync.
  int xGridSize = (dst.size(2) * dst.size(3) * dst.size(4) + kBaseThreadNum - 1) / kBaseThreadNum;
  dim3 dimGrid(xGridSize, dst.size(1), dst.size(0));
  CheckLaunchParam(dimGrid, dimBlock, "Pad");
  cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
  image_3d_pad_edge_kernel<kBaseThreadBits, DType>
      <<<dimGrid, dimBlock, 0, stream>>>(dst, src, padF, padT, padL);
  MSHADOW_CUDA_POST_KERNEL_CHECK(image_3d_pad_edge_kernel);
}
// Edge ("replication") padding, backward, 3-d (NCDHW) case.
// One thread per grad_out voxel; blockIdx.y = channel, blockIdx.z = batch.
// Many clamped border voxels share one source voxel, so threads can target
// the same grad_in element; atomicAdd makes the accumulation race-free
// (grad_in is added into, not overwritten).
template <int n_bits, typename DType>
__global__ void image_3d_pad_edge_grad_kernel(Tensor<gpu, 5, DType> grad_in,
                                              const Tensor<gpu, 5, DType> grad_out,
                                              const int padF,
                                              const int padT,
                                              const int padL) {
  int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
  int plane = blockIdx.y;  // channel index
  int batch = blockIdx.z;  // batch index
  // Guard the grid tail beyond the flattened D*H*W volume.
  if (outputPointId >= grad_out.size(2) * grad_out.size(3) * grad_out.size(4)) {
    return;
  }
  int outputPointX = outputPointId % grad_out.size(4);
  int outputPointY = (outputPointId / grad_out.size(4)) % grad_out.size(3);
  int outputPointZ = outputPointId / (grad_out.size(3) * grad_out.size(4));
  // For non-negative pads iStart* = 0 and oStart* = pad*.
  int iStartX = max(0, -padL);
  int iStartY = max(0, -padT);
  int iStartZ = max(0, -padF);
  int oStartX = max(0, padL);
  int oStartY = max(0, padT);
  int oStartZ = max(0, padF);
  // Same clamping map as the forward kernel.
  int inputPointX = min(max(padL, outputPointX), static_cast<int>(grad_in.size(4)) + padL - 1) -
                    oStartX + iStartX;
  int inputPointY = min(max(padT, outputPointY), static_cast<int>(grad_in.size(3)) + padT - 1) -
                    oStartY + iStartY;
  int inputPointZ = min(max(padF, outputPointZ), static_cast<int>(grad_in.size(2)) + padF - 1) -
                    oStartZ + iStartZ;
  DType valueToCopy = grad_out[batch][plane][outputPointZ][outputPointY][outputPointX];
  atomicAdd(&grad_in[batch][plane][inputPointZ][inputPointY][inputPointX], valueToCopy);
}
// Host launcher for the 3-d edge-padding backward pass.
// Grid layout: x covers the flattened D*H*W grad_out volume, y = channels,
// z = batch samples.
template <typename DType>
inline void image_pad_edge_grad(Tensor<gpu, 5, DType> grad_in,
                                const Tensor<gpu, 5, DType>& grad_out,
                                const mxnet::TShape& pad) {
  // Leading pads along D (pad[4]), H (pad[6]) and W (pad[8]).
  const int padF = pad[4];
  const int padT = pad[6];
  const int padL = pad[8];
  dim3 dimBlock(kBaseThreadNum);
  // Use kBaseThreadNum rather than a hard-coded 256 so the ceil-divide
  // always matches the launched block size.
  int xGridSize =
      (grad_out.size(2) * grad_out.size(3) * grad_out.size(4) + kBaseThreadNum - 1) /
      kBaseThreadNum;
  dim3 dimGrid(xGridSize, grad_out.size(1), grad_out.size(0));
  CheckLaunchParam(dimGrid, dimBlock, "Pad");
  cudaStream_t stream = Stream<gpu>::GetStream(grad_out.stream_);
  image_3d_pad_edge_grad_kernel<kBaseThreadBits, DType>
      <<<dimGrid, dimBlock, 0, stream>>>(grad_in, grad_out, padF, padT, padL);
  MSHADOW_CUDA_POST_KERNEL_CHECK(image_3d_pad_edge_grad_kernel);
}
// Case 2: Constant Padding
// Constant padding, forward, 3-d (NCDHW) case.
// One thread per dst voxel; blockIdx.y = channel, blockIdx.z = batch.
// Branch-free: every thread does a clamped (always in-bounds) src read and a
// border test, then arithmetically selects the src value or `constant`.
template <int n_bits, typename DType>
__global__ void image_3d_pad_constant_kernel(Tensor<gpu, 5, DType> dst,
                                             const Tensor<gpu, 5, DType> src,
                                             const int padF,
                                             const int padT,
                                             const int padL,
                                             const DType constant) {
  int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
  // Guard the grid tail beyond the flattened D*H*W volume.
  if (outputPointId >= dst.size(2) * dst.size(3) * dst.size(4)) {
    return;
  }
  // cast sizes to int to use in min/max
  int Nz = src.size(2);
  int Ny = src.size(3);
  int Nx = src.size(4);
  int plane = blockIdx.y;  // channel index
  int batch = blockIdx.z;  // batch index
  int outputPointX = outputPointId % dst.size(4);
  int outputPointY = (outputPointId / dst.size(4)) % dst.size(3);
  int outputPointZ = outputPointId / (dst.size(3) * dst.size(4));
  // Each check* is > 0 exactly when the point is NOT inside that side's pad
  // band; the product below is nonzero only for interior points.
  int checkFront = max(0, outputPointZ - padF + 1);
  int checkBack = max(0, padF + Nz - outputPointZ);
  int checkTop = max(0, outputPointY - padT + 1);
  int checkBottom = max(0, padT + Ny - outputPointY);
  int checkLeft = max(0, outputPointX - padL + 1);
  int checkRight = max(0, padL + Nx - outputPointX);
  // Clamp so the speculative src read is in bounds even for border threads
  // whose loaded value is discarded by the select below.
  int inputPointZ = min(max(outputPointZ - padF, 0), Nz - 1);
  int inputPointX = min(max(outputPointX - padL, 0), Nx - 1);
  int inputPointY = min(max(outputPointY - padT, 0), Ny - 1);
  // 1 if need padding, 0 if not
  int need_pad = !(checkFront * checkBack * checkTop * checkBottom * checkLeft * checkRight);
  DType valueToCopy = src[batch][plane][inputPointZ][inputPointY][inputPointX];
  dst[batch][plane][outputPointZ][outputPointY][outputPointX] =
      valueToCopy * (!need_pad) + need_pad * constant;
}
// Host launcher for 3-d constant padding.
// Grid layout: x covers the flattened D*H*W output volume, y = channels,
// z = batch samples.
template <typename DType>
inline void image_pad_constant(Tensor<gpu, 5, DType> dst,
                               const Tensor<gpu, 5, DType>& src,
                               const mxnet::TShape& pad,
                               const DType constant) {
  // Leading pads along D (pad[4]), H (pad[6]) and W (pad[8]).
  const int padF = pad[4];
  const int padT = pad[6];
  const int padL = pad[8];
  dim3 dimBlock(kBaseThreadNum);
  // Ceil-divide by kBaseThreadNum (the real block size) instead of a
  // hard-coded 256 so grid and block can never fall out of sync.
  int xGridSize = (dst.size(2) * dst.size(3) * dst.size(4) + kBaseThreadNum - 1) / kBaseThreadNum;
  dim3 dimGrid(xGridSize, dst.size(1), dst.size(0));
  CheckLaunchParam(dimGrid, dimBlock, "Pad");
  cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
  image_3d_pad_constant_kernel<kBaseThreadBits, DType>
      <<<dimGrid, dimBlock, 0, stream>>>(dst, src, padF, padT, padL, constant);
  MSHADOW_CUDA_POST_KERNEL_CHECK(image_3d_pad_constant_kernel);
}
// Constant padding, backward, 3-d (NCDHW) case.
// The padded border contributes no gradient, so the backward pass copies the
// interior D*H*W window of grad_out into grad_in, shifted by the leading
// (front, top, left) pad offsets. One thread per grad_in voxel;
// blockIdx.y = channel, blockIdx.z = batch.
template <int n_bits, typename DType>
__global__ void image_3d_pad_constant_grad_kernel(Tensor<gpu, 5, DType> grad_in,
                                                  const Tensor<gpu, 5, DType> grad_out,
                                                  const int padF,
                                                  const int padT,
                                                  const int padL) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  const int depth = grad_in.size(2);
  const int height = grad_in.size(3);
  const int width = grad_in.size(4);
  // Guard the grid tail beyond the flattened D*H*W volume.
  if (tid >= depth * height * width) {
    return;
  }
  const int c = blockIdx.y;  // channel (plane)
  const int n = blockIdx.z;  // batch sample
  const int x = tid % width;
  const int y = (tid / width) % height;
  const int z = tid / (width * height);
  grad_in[n][c][z][y][x] = grad_out[n][c][z + padF][y + padT][x + padL];
}
// Host launcher for the 3-d constant-padding backward pass.
// Grid layout: x covers the flattened D*H*W grad_in volume, y = channels,
// z = batch samples.
template <typename DType>
inline void image_pad_constant_grad(Tensor<gpu, 5, DType> grad_in,
                                    const Tensor<gpu, 5, DType>& grad_out,
                                    const mxnet::TShape& pad) {
  // Leading pads along D (pad[4]), H (pad[6]) and W (pad[8]).
  const int padF = pad[4];
  const int padT = pad[6];
  const int padL = pad[8];
  dim3 dimBlock(kBaseThreadNum);
  // Use kBaseThreadNum rather than a hard-coded 256 so the ceil-divide
  // always matches the launched block size.
  int xGridSize =
      (grad_in.size(2) * grad_in.size(3) * grad_in.size(4) + kBaseThreadNum - 1) / kBaseThreadNum;
  dim3 dimGrid(xGridSize, grad_in.size(1), grad_in.size(0));
  CheckLaunchParam(dimGrid, dimBlock, "Pad");
  cudaStream_t stream = Stream<gpu>::GetStream(grad_in.stream_);
  image_3d_pad_constant_grad_kernel<kBaseThreadBits, DType>
      <<<dimGrid, dimBlock, 0, stream>>>(grad_in, grad_out, padF, padT, padL);
  MSHADOW_CUDA_POST_KERNEL_CHECK(image_3d_pad_constant_grad_kernel);
}
// Case 3: Reflection Padding
// Reflection padding, forward, 3-d (NCDHW) case.
// One thread per dst voxel; blockIdx.y = channel, blockIdx.z = batch.
// NOTE(review): single-bounce reflection only -- the computed index leaves
// the valid range if a pad exceeds the input extent; presumably shape
// checks upstream rule that out. TODO confirm.
template <int n_bits, typename DType>
__global__ void image_3d_pad_reflect_kernel(Tensor<gpu, 5, DType> dst,
                                            const Tensor<gpu, 5, DType> src,
                                            const int padF,
                                            const int padT,
                                            const int padL) {
  int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
  int plane = blockIdx.y;  // channel index
  int batch = blockIdx.z;  // batch index
  // Guard the grid tail beyond the flattened D*H*W volume.
  if (outputPointId >= dst.size(2) * dst.size(3) * dst.size(4)) {
    return;
  }
  int outputPointX = outputPointId % dst.size(4);
  int outputPointY = (outputPointId / dst.size(4)) % dst.size(3);
  int outputPointZ = outputPointId / (dst.size(3) * dst.size(4));
  // For non-negative pads iStart* = 0 and oStart* = pad*.
  int iStartX = max(0, -padL);
  int iStartY = max(0, -padT);
  int iStartZ = max(0, -padF);
  int oStartX = max(0, padL);
  int oStartY = max(0, padT);
  int oStartZ = max(0, padF);
  // __sad(a, b, 0) == |a - b|.  The branch-free expression reduces to the
  // identity (input = output - pad) for interior points and to the mirrored
  // index for points inside the pad bands.
  int inputPointX = __sad(outputPointX, padL, 0) - __sad(outputPointX, src.size(4) + padL - 1, 0) -
                    outputPointX + 2 * padL + src.size(4) - 1 - oStartX + iStartX;
  int inputPointY = __sad(outputPointY, padT, 0) - __sad(outputPointY, src.size(3) + padT - 1, 0) -
                    outputPointY + 2 * padT + src.size(3) - 1 - oStartY + iStartY;
  int inputPointZ = __sad(outputPointZ, padF, 0) - __sad(outputPointZ, src.size(2) + padF - 1, 0) -
                    outputPointZ + 2 * padF + src.size(2) - 1 - oStartZ + iStartZ;
  DType valueToCopy = src[batch][plane][inputPointZ][inputPointY][inputPointX];
  dst[batch][plane][outputPointZ][outputPointY][outputPointX] = valueToCopy;
}
// Host launcher for 3-d reflection padding.
// Grid layout: x covers the flattened D*H*W output volume, y = channels,
// z = batch samples.
template <typename DType>
inline void image_pad_reflect(Tensor<gpu, 5, DType> dst,
                              const Tensor<gpu, 5, DType>& src,
                              const mxnet::TShape& pad) {
  // Leading pads along D (pad[4]), H (pad[6]) and W (pad[8]).
  const int padF = pad[4];
  const int padT = pad[6];
  const int padL = pad[8];
  dim3 dimBlock(kBaseThreadNum);
  // Ceil-divide by kBaseThreadNum (the real block size) instead of a
  // hard-coded 256 so grid and block stay in sync.
  int xGridSize = (dst.size(2) * dst.size(3) * dst.size(4) + kBaseThreadNum - 1) / kBaseThreadNum;
  dim3 dimGrid(xGridSize, dst.size(1), dst.size(0));
  CheckLaunchParam(dimGrid, dimBlock, "Pad");
  cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
  image_3d_pad_reflect_kernel<kBaseThreadBits, DType>
      <<<dimGrid, dimBlock, 0, stream>>>(dst, src, padF, padT, padL);
  MSHADOW_CUDA_POST_KERNEL_CHECK(image_3d_pad_reflect_kernel);
}
// Reflection padding, backward, 3-d (NCDHW) case.
// One thread per grad_out voxel; blockIdx.y = channel, blockIdx.z = batch.
// A mirrored border voxel and its interior source map to the same grad_in
// element, so threads can collide; atomicAdd makes the accumulation
// race-free (grad_in is added into, not overwritten).
// NOTE(review): single-bounce reflection only -- assumes each pad is smaller
// than the corresponding input extent; presumably enforced upstream.
// (A stale commented-out copy of the 2-d gradient kernel body that used to
// follow this kernel has been removed.)
template <int n_bits, typename DType>
__global__ void image_3d_pad_reflect_grad_kernel(Tensor<gpu, 5, DType> grad_in,
                                                 const Tensor<gpu, 5, DType> grad_out,
                                                 const int padF,
                                                 const int padT,
                                                 const int padL) {
  int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
  int plane = blockIdx.y;  // channel index
  int batch = blockIdx.z;  // batch index
  // Guard the grid tail beyond the flattened D*H*W volume.
  if (outputPointId >= grad_out.size(2) * grad_out.size(3) * grad_out.size(4)) {
    return;
  }
  int outputPointX = outputPointId % grad_out.size(4);
  int outputPointY = (outputPointId / grad_out.size(4)) % grad_out.size(3);
  int outputPointZ = outputPointId / (grad_out.size(3) * grad_out.size(4));
  // For non-negative pads iStart* = 0 and oStart* = pad*.
  int iStartX = max(0, -padL);
  int iStartY = max(0, -padT);
  int iStartZ = max(0, -padF);
  int oStartX = max(0, padL);
  int oStartY = max(0, padT);
  int oStartZ = max(0, padF);
  // __sad(a, b, 0) == |a - b|: identity map for interior points, mirrored
  // index inside the pad bands (same formula as the forward kernel).
  int inputPointX = __sad(outputPointX, padL, 0) -
                    __sad(outputPointX, grad_in.size(4) + padL - 1, 0) - outputPointX + 2 * padL +
                    grad_in.size(4) - 1 - oStartX + iStartX;
  int inputPointY = __sad(outputPointY, padT, 0) -
                    __sad(outputPointY, grad_in.size(3) + padT - 1, 0) - outputPointY + 2 * padT +
                    grad_in.size(3) - 1 - oStartY + iStartY;
  int inputPointZ = __sad(outputPointZ, padF, 0) -
                    __sad(outputPointZ, grad_in.size(2) + padF - 1, 0) - outputPointZ + 2 * padF +
                    grad_in.size(2) - 1 - oStartZ + iStartZ;
  DType valueToCopy = grad_out[batch][plane][outputPointZ][outputPointY][outputPointX];
  atomicAdd(&grad_in[batch][plane][inputPointZ][inputPointY][inputPointX], valueToCopy);
}
// Host launcher for the 3-d reflection-padding backward pass.
// Grid layout: x covers the flattened D*H*W grad_out volume, y = channels,
// z = batch samples.
template <typename DType>
inline void image_pad_reflect_grad(Tensor<gpu, 5, DType> grad_in,
                                   const Tensor<gpu, 5, DType>& grad_out,
                                   const mxnet::TShape& pad) {
  // Leading pads along D (pad[4]), H (pad[6]) and W (pad[8]).
  const int padF = pad[4];
  const int padT = pad[6];
  const int padL = pad[8];
  dim3 dimBlock(kBaseThreadNum);
  // Use kBaseThreadNum rather than a hard-coded 256 so the ceil-divide
  // always matches the launched block size.
  int xGridSize =
      (grad_out.size(2) * grad_out.size(3) * grad_out.size(4) + kBaseThreadNum - 1) /
      kBaseThreadNum;
  dim3 dimGrid(xGridSize, grad_out.size(1), grad_out.size(0));
  CheckLaunchParam(dimGrid, dimBlock, "Pad");
  cudaStream_t stream = Stream<gpu>::GetStream(grad_out.stream_);
  image_3d_pad_reflect_grad_kernel<kBaseThreadBits, DType>
      <<<dimGrid, dimBlock, 0, stream>>>(grad_in, grad_out, padF, padT, padL);
  MSHADOW_CUDA_POST_KERNEL_CHECK(image_3d_pad_reflect_grad_kernel);
}
////////////////////////////////////////////////////////////////////////////////
} // namespace cuda
// Forward entry point: dispatch to the kernel launcher that matches the
// requested padding mode. An unrecognized mode performs no work.
template <int dim, typename DType>
void pad_image(Tensor<gpu, dim, DType> dst,
               const Tensor<gpu, dim, DType> src,
               const mxnet::TShape pad,
               int mode,
               const DType constant_value) {
  if (mode == mxnet::op::pad_enum::kEdge) {
    cuda::image_pad_edge(dst, src, pad);
  } else if (mode == mxnet::op::pad_enum::kConstant) {
    cuda::image_pad_constant(dst, src, pad, constant_value);
  } else if (mode == mxnet::op::pad_enum::kReflect) {
    cuda::image_pad_reflect(dst, src, pad);
  }
}
// Backward entry point: route the gradient to the launcher matching the
// padding mode used in the forward pass. An unrecognized mode performs no
// work.
template <int dim, typename DType>
void pad_image_grad(Tensor<gpu, dim, DType> grad_in,
                    const Tensor<gpu, dim, DType> grad_out,
                    const mxnet::TShape pad,
                    int mode) {
  if (mode == mxnet::op::pad_enum::kEdge) {
    cuda::image_pad_edge_grad(grad_in, grad_out, pad);
  } else if (mode == mxnet::op::pad_enum::kConstant) {
    cuda::image_pad_constant_grad(grad_in, grad_out, pad);
  } else if (mode == mxnet::op::pad_enum::kReflect) {
    cuda::image_pad_reflect_grad(grad_in, grad_out, pad);
  }
}
} // namespace mshadow
////////////////////////////////////////////////////////////////////////////////
namespace mxnet {
namespace op {
// GPU operator factory: instantiate PadOp for the requested real dtype.
// The MSHADOW_REAL_TYPE_SWITCH macro expands the assignment for each
// supported floating-point dtype; op stays nullptr for unsupported dtypes.
template <>
Operator* CreateOp<gpu>(PadParam param, int dtype) {
  Operator* op = nullptr;
  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { op = new PadOp<gpu, DType>(param); })
  return op;
}
} // namespace op
} // namespace mxnet