| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| /*! |
| * Copyright (c) 2019 by Contributors |
| * \file bilinear_resize-inl.cuh |
| * \brief bilinear resize operator cuda implementation |
| * \author Hang Zhang, Jake Lee |
| */ |
| |
| #ifndef MXNET_OPERATOR_CONTRIB_BILINEAR_RESIZE_CUH_ |
| #define MXNET_OPERATOR_CONTRIB_BILINEAR_RESIZE_CUH_ |
| |
| #include <cuda_runtime_api.h> |
| #include <algorithm> |
| |
| namespace mxnet { |
| namespace op { |
| |
| using namespace mshadow; |
| |
| enum ImageLayout { |
| HWC, |
| NHWC, |
| NCHW |
| }; |
| |
| template<typename In, typename Out> |
| struct ScalarConvert { |
| static __host__ __device__ __forceinline__ Out to(const In v) { return (Out) v; } |
| }; |
| |
| // The maximum number of threads in a block |
| static const unsigned MAX_BLOCK_SIZE = 512U; |
| |
| // Number of threads in a block given an input size up to MAX_BLOCK_SIZE |
| static unsigned getNumThreads(int nElem, const bool smaller) { |
| unsigned threadSizes[5] = {32, 64, 128, 256, MAX_BLOCK_SIZE}; |
| const int maxi = smaller ? 4 : 5; |
| for (int i = 0; i != maxi; ++i) { |
| if (static_cast<unsigned>(nElem) <= threadSizes[i]) { |
| return threadSizes[i]; |
| } |
| } |
| return smaller ? (MAX_BLOCK_SIZE >> 1) : MAX_BLOCK_SIZE; |
| } |
| |
| // caffe_gpu_interp2_kernel overloading with Tensor<xpu, 3, DType> |
| template<typename xpu, typename Dtype, typename Acctype> |
| __global__ void caffe_gpu_interp2_kernel(const int n, |
| const Acctype rheight, const Acctype rwidth, |
| const Tensor<xpu, 3, Dtype> data1, |
| Tensor<xpu, 3, Dtype> data2, |
| ImageLayout layout) { |
| int index = threadIdx.x + blockIdx.x * blockDim.x; |
| const int channels = data1.size(2); |
| const int height1 = data1.size(0); |
| const int width1 = data1.size(1); |
| const int height2 = data2.size(0); |
| const int width2 = data2.size(1); |
| |
| if (index < n) { |
| const int w2 = index % width2; // 0:width2-1 |
| const int h2 = index / width2; // 0:height2-1 |
| // special case: just copy |
| if (height1 == height2 && width1 == width2) { |
| const int h1 = h2; |
| const int w1 = w2; |
| for (int c = 0; c < channels; ++c) { |
| const Dtype val = data1[h1][w1][c]; |
| data2[h2][w2][c] = val; |
| } |
| return; |
| } |
| // |
| const Acctype h1r = rheight * h2; |
| const int h1 = h1r; |
| const int h1p = (h1 < height1 - 1) ? 1 : 0; |
| const Acctype h1lambda = h1r - h1; |
| const Acctype h0lambda = Acctype(1) - h1lambda; |
| // |
| const Acctype w1r = rwidth * w2; |
| const int w1 = w1r; |
| const int w1p = (w1 < width1 - 1) ? 1 : 0; |
| const Acctype w1lambda = w1r - w1; |
| const Acctype w0lambda = Acctype(1) - w1lambda; |
| for (int c = 0; c < channels; ++c) { |
| const Acctype val = h0lambda * (w0lambda * data1[h1][w1][c] |
| + w1lambda * data1[h1][w1+w1p][c]) |
| + h1lambda * (w0lambda * data1[h1+h1p][w1][c] |
| + w1lambda * data1[h1+h1p][w1+w1p][c]); |
| data2[h2][w2][c] = ScalarConvert<Acctype, Dtype>::to(val); |
| } |
| } |
| } |
| |
| // caffe_gpu_interp2_kernel overloading with Tensor<xpu, 4, DType> |
| template<typename xpu, typename Dtype, typename Acctype> |
| __global__ void caffe_gpu_interp2_kernel(const int n, |
| const Acctype rheight, const Acctype rwidth, |
| const Tensor<xpu, 4, Dtype> data1, |
| Tensor<xpu, 4, Dtype> data2, |
| ImageLayout layout) { |
| int index = threadIdx.x + blockIdx.x * blockDim.x; |
| int batch_size = (layout == NHWC) ? data1.size(0) : data1.size(0); |
| int channels = (layout == NHWC) ? data1.size(3) : data1.size(1); |
| int height1 = (layout == NHWC) ? data1.size(1) : data1.size(2); |
| int width1 = (layout == NHWC) ? data1.size(2) : data1.size(3); |
| int height2 = (layout == NHWC) ? data2.size(1) : data2.size(2); |
| int width2 = (layout == NHWC) ? data2.size(2): data2.size(3); |
| |
| if (index < n) { |
| const int w2 = index % width2; // 0:width2-1 |
| const int h2 = index / width2; // 0:height2-1 |
| // special case: just copy |
| if (height1 == height2 && width1 == width2) { |
| const int h1 = h2; |
| const int w1 = w2; |
| |
| for (int n = 0; n < batch_size; ++n) { |
| for (int c = 0; c < channels; ++c) { |
| if (layout == NHWC) { |
| const Dtype val = data1[n][h1][w1][c]; |
| data2[n][h2][w2][c] = val; |
| } else { |
| const Dtype val = data1[n][c][h1][w1]; |
| data2[n][c][h2][w2] = val; |
| } |
| } |
| } |
| return; |
| } |
| // |
| const Acctype h1r = rheight * h2; |
| const int h1 = h1r; |
| const int h1p = (h1 < height1 - 1) ? 1 : 0; |
| const Acctype h1lambda = h1r - h1; |
| const Acctype h0lambda = Acctype(1) - h1lambda; |
| // |
| const Acctype w1r = rwidth * w2; |
| const int w1 = w1r; |
| const int w1p = (w1 < width1 - 1) ? 1 : 0; |
| const Acctype w1lambda = w1r - w1; |
| const Acctype w0lambda = Acctype(1) - w1lambda; |
| |
| for (auto n = 0; n < batch_size; ++n) { |
| for (int c = 0; c < channels; ++c) { |
| if (layout == NHWC) { |
| const Acctype val = h0lambda * (w0lambda * data1[n][h1][w1][c] |
| + w1lambda * data1[n][h1][w1+w1p][c]) |
| + h1lambda * (w0lambda * data1[n][h1+h1p][w1][c] |
| + w1lambda * data1[n][h1+h1p][w1+w1p][c]); |
| data2[n][h2][w2][c] = ScalarConvert<Acctype, Dtype>::to(val); |
| } else { |
| const Acctype val = h0lambda * (w0lambda * data1[n][c][h1][w1] |
| + w1lambda * data1[n][c][h1][w1+w1p]) |
| + h1lambda * (w0lambda * data1[n][c][h1+h1p][w1] |
| + w1lambda * data1[n][c][h1+h1p][w1+w1p]); |
| data2[n][c][h2][w2] = ScalarConvert<Acctype, Dtype>::to(val); |
| } |
| } |
| } |
| } |
| } |
| |
| } // namespace op |
| } // namespace mxnet |
| |
| #endif // MXNET_OPERATOR_CONTRIB_BILINEAR_RESIZE_CUH_ |