/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
/*!
 * Copyright (c) 2019 by Contributors
 * \file bilinear_resize-inl.cuh
 * \brief bilinear resize operator CUDA implementation
 * \author Hang Zhang, Jake Lee
 */
#ifndef MXNET_OPERATOR_CONTRIB_BILINEAR_RESIZE_CUH_
#define MXNET_OPERATOR_CONTRIB_BILINEAR_RESIZE_CUH_
#include <cuda_runtime_api.h>
#include <algorithm>
namespace mxnet {
namespace op {
using namespace mshadow;
enum ImageLayout {
  HWC,
  NHWC,
  NCHW
};
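// HWC pairs with the 3-D kernel overload below; NHWC and NCHW select the
// indexing order inside the 4-D overload.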
template<typename In, typename Out>
struct ScalarConvert {
  static __host__ __device__ __forceinline__ Out to(const In v) { return (Out) v; }
};
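// Typical use: accumulate in a wider Acctype (e.g. float) and narrow back to
// the tensor's Dtype on store, as in ScalarConvert<Acctype, Dtype>::to(val).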
// The maximum number of threads in a block
static const unsigned MAX_BLOCK_SIZE = 512U;
// Returns the number of threads per block for an input of nElem elements,
// rounded up to the next size in threadSizes and capped at MAX_BLOCK_SIZE
// (or at MAX_BLOCK_SIZE / 2 when `smaller` is set).
static unsigned getNumThreads(int nElem, const bool smaller) {
  unsigned threadSizes[5] = {32, 64, 128, 256, MAX_BLOCK_SIZE};
  const int maxi = smaller ? 4 : 5;
  for (int i = 0; i != maxi; ++i) {
    if (static_cast<unsigned>(nElem) <= threadSizes[i]) {
      return threadSizes[i];
    }
  }
  return smaller ? (MAX_BLOCK_SIZE >> 1) : MAX_BLOCK_SIZE;
}
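// Illustrative only: a minimal host-side sketch of sizing a 1-D grid with
// getNumThreads. The kernels below launch one thread per output pixel;
// `stream`, `in`, `out`, `DType`, and `AccReal` are assumptions, not names
// defined in this file.
//
//   const int num_kernels = height2 * width2;  // one thread per output pixel
//   const unsigned block = getNumThreads(num_kernels, false);
//   const unsigned grid = (num_kernels + block - 1) / block;
//   caffe_gpu_interp2_kernel<gpu, DType, AccReal>
//       <<<grid, block, 0, stream>>>(num_kernels, rheight, rwidth, in, out, HWC);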
// caffe_gpu_interp2_kernel overload for Tensor<xpu, 3, Dtype> (HWC layout;
// the layout argument is accepted for symmetry with the 4-D overload)
template<typename xpu, typename Dtype, typename Acctype>
__global__ void caffe_gpu_interp2_kernel(const int n,
                                         const Acctype rheight, const Acctype rwidth,
                                         const Tensor<xpu, 3, Dtype> data1,
                                         Tensor<xpu, 3, Dtype> data2,
                                         ImageLayout layout) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  const int channels = data1.size(2);
  const int height1 = data1.size(0);
  const int width1 = data1.size(1);
  const int height2 = data2.size(0);
  const int width2 = data2.size(1);
  if (index < n) {
    const int w2 = index % width2;  // 0:width2-1
    const int h2 = index / width2;  // 0:height2-1
    // special case: just copy
    if (height1 == height2 && width1 == width2) {
      const int h1 = h2;
      const int w1 = w2;
      for (int c = 0; c < channels; ++c) {
        const Dtype val = data1[h1][w1][c];
        data2[h2][w2][c] = val;
      }
      return;
    }
    // Source row coordinate and bilinear weights along the height axis:
    // h0lambda weights row h1, h1lambda weights row h1 + h1p (clamped at the border).
    const Acctype h1r = rheight * h2;
    const int h1 = h1r;
    const int h1p = (h1 < height1 - 1) ? 1 : 0;
    const Acctype h1lambda = h1r - h1;
    const Acctype h0lambda = Acctype(1) - h1lambda;
    // Source column coordinate and bilinear weights along the width axis.
    const Acctype w1r = rwidth * w2;
    const int w1 = w1r;
    const int w1p = (w1 < width1 - 1) ? 1 : 0;
    const Acctype w1lambda = w1r - w1;
    const Acctype w0lambda = Acctype(1) - w1lambda;
    for (int c = 0; c < channels; ++c) {
      const Acctype val = h0lambda * (w0lambda * data1[h1][w1][c]
                                      + w1lambda * data1[h1][w1+w1p][c])
                          + h1lambda * (w0lambda * data1[h1+h1p][w1][c]
                                        + w1lambda * data1[h1+h1p][w1+w1p][c]);
      data2[h2][w2][c] = ScalarConvert<Acctype, Dtype>::to(val);
    }
  }
}
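// Illustrative only: rheight/rwidth are expected to be align_corners-style
// source/destination ratios, as in the Caffe/PyTorch lineage of this kernel.
// A common way to compute them (an assumption, not defined in this file):
//
//   const AccReal rheight = (height2 > 1)
//       ? static_cast<AccReal>(height1 - 1) / (height2 - 1) : AccReal(0);
//   const AccReal rwidth = (width2 > 1)
//       ? static_cast<AccReal>(width1 - 1) / (width2 - 1) : AccReal(0);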
// caffe_gpu_interp2_kernel overload for Tensor<xpu, 4, Dtype> (NHWC or NCHW layout)
template<typename xpu, typename Dtype, typename Acctype>
__global__ void caffe_gpu_interp2_kernel(const int n,
                                         const Acctype rheight, const Acctype rwidth,
                                         const Tensor<xpu, 4, Dtype> data1,
                                         Tensor<xpu, 4, Dtype> data2,
                                         ImageLayout layout) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int batch_size = data1.size(0);  // batch is dimension 0 in both layouts
  int channels = (layout == NHWC) ? data1.size(3) : data1.size(1);
  int height1 = (layout == NHWC) ? data1.size(1) : data1.size(2);
  int width1 = (layout == NHWC) ? data1.size(2) : data1.size(3);
  int height2 = (layout == NHWC) ? data2.size(1) : data2.size(2);
  int width2 = (layout == NHWC) ? data2.size(2) : data2.size(3);
  if (index < n) {
    const int w2 = index % width2;  // 0:width2-1
    const int h2 = index / width2;  // 0:height2-1
    // special case: just copy
    if (height1 == height2 && width1 == width2) {
      const int h1 = h2;
      const int w1 = w2;
      for (int b = 0; b < batch_size; ++b) {  // b avoids shadowing the parameter n
        for (int c = 0; c < channels; ++c) {
          if (layout == NHWC) {
            const Dtype val = data1[b][h1][w1][c];
            data2[b][h2][w2][c] = val;
          } else {
            const Dtype val = data1[b][c][h1][w1];
            data2[b][c][h2][w2] = val;
          }
        }
      }
      return;
    }
    // Source row coordinate and bilinear weights along the height axis.
    const Acctype h1r = rheight * h2;
    const int h1 = h1r;
    const int h1p = (h1 < height1 - 1) ? 1 : 0;
    const Acctype h1lambda = h1r - h1;
    const Acctype h0lambda = Acctype(1) - h1lambda;
    // Source column coordinate and bilinear weights along the width axis.
    const Acctype w1r = rwidth * w2;
    const int w1 = w1r;
    const int w1p = (w1 < width1 - 1) ? 1 : 0;
    const Acctype w1lambda = w1r - w1;
    const Acctype w0lambda = Acctype(1) - w1lambda;
    for (int b = 0; b < batch_size; ++b) {
      for (int c = 0; c < channels; ++c) {
        if (layout == NHWC) {
          const Acctype val = h0lambda * (w0lambda * data1[b][h1][w1][c]
                                          + w1lambda * data1[b][h1][w1+w1p][c])
                              + h1lambda * (w0lambda * data1[b][h1+h1p][w1][c]
                                            + w1lambda * data1[b][h1+h1p][w1+w1p][c]);
          data2[b][h2][w2][c] = ScalarConvert<Acctype, Dtype>::to(val);
        } else {
          const Acctype val = h0lambda * (w0lambda * data1[b][c][h1][w1]
                                          + w1lambda * data1[b][c][h1][w1+w1p])
                              + h1lambda * (w0lambda * data1[b][c][h1+h1p][w1]
                                            + w1lambda * data1[b][c][h1+h1p][w1+w1p]);
          data2[b][c][h2][w2] = ScalarConvert<Acctype, Dtype>::to(val);
        }
      }
    }
  }
}
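// Illustrative only: a minimal sketch of launching the 4-D overload for an
// NCHW tensor. Names such as `s`, `input`, `output`, `DType`, and `AccReal`
// are assumptions, not part of this file:
//
//   Tensor<gpu, 4, DType> input;   // shape (N, C, H1, W1)
//   Tensor<gpu, 4, DType> output;  // shape (N, C, H2, W2)
//   const int num_kernels = output.size(2) * output.size(3);
//   const unsigned block = getNumThreads(num_kernels, false);
//   const unsigned grid = (num_kernels + block - 1) / block;
//   cudaStream_t stream = Stream<gpu>::GetStream(s);
//   caffe_gpu_interp2_kernel<gpu, DType, AccReal>
//       <<<grid, block, 0, stream>>>(num_kernels, rheight, rwidth,
//                                    input, output, NCHW);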
} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_CONTRIB_BILINEAR_RESIZE_CUH_