| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| /*! |
| * \file image_random.cu |
| * \brief GPU Implementation of image transformation operators |
| */ |
| #include <cuda_runtime_api.h> |
| #include "./image_random-inl.h" |
| #include "../elemwise_op_common.h" |
| |
| namespace mxnet { |
| namespace op { |
| namespace image { |
| |
| using namespace mshadow; |
| |
| // ToTensor Kernel for 3D input |
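// Converts an (H, W, C) image of type Dtype into a (C, H, W) float
// tensor, dividing every value by normalize_factor.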
| /* |
| * In order to not generate the code that uses too many |
| * registers (resulting in too many resources requested |
| * error) we need to tell the compiler that we will be |
| * launching this kernel with cuda::kMaxThreadsPerBlock |
| * threads per block. Setting __launch_bounds__ ensures |
| * that such configuration can always be launched. |
| */ |
| template <typename xpu, typename Dtype> |
| __global__ void __launch_bounds__(cuda::kMaxThreadsPerBlock, 1) |
| ToTensorCudaKernel(const Tensor<xpu, 3, Dtype> input, |
| const Tensor<xpu, 3, float> output, |
| const int req, |
| const int N, |
| const int H, |
| const int W, |
| const int C, |
| const float normalize_factor) { |
  // We process one image per thread block.
  // In the 3D case there is only one image, hence a single
  // block, so blockIdx.x is not used.
| for (int c = 0; c < C; ++c) { |
| for (int h = threadIdx.y; h < H; h += blockDim.y) { |
| for (int w = threadIdx.x; w < W; w += blockDim.x) { |
| KERNEL_ASSIGN(output[c][h][w], req, input[h][w][c] / normalize_factor); |
| } |
| } |
| } |
| } |
| |
| // ToTensor Kernel for 4D input |
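// Converts a batch of (N, H, W, C) images of type Dtype into an
// (N, C, H, W) float tensor, dividing every value by normalize_factor.
// blockIdx.x selects the image within the batch.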
| template <typename xpu, typename Dtype> |
| __global__ void __launch_bounds__(cuda::kMaxThreadsPerBlock, 1) |
| ToTensorCudaKernel(const Tensor<xpu, 4, Dtype> input, |
| const Tensor<xpu, 4, float> output, |
| const int req, |
| const int N, |
| const int H, |
| const int W, |
| const int C, |
| const float normalize_factor) { |
| // We process one image per thread block. |
| const int n = blockIdx.x; |
| |
| for (int c = 0; c < C; ++c) { |
| for (int h = threadIdx.y; h < H; h += blockDim.y) { |
| for (int w = threadIdx.x; w < W; w += blockDim.x) { |
| KERNEL_ASSIGN(output[n][c][h][w], req, input[n][h][w][c] / normalize_factor); |
| } |
| } |
| } |
| } |
| |
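/*
 * Dispatches the ToTensor kernel: one image per thread block
 * (a single block for 3D input, N blocks for 4D input), with a
 * 32x32 thread block striding over the spatial dimensions.
 */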
| template <typename DType, typename T1, typename T2> |
| void ToTensorImplCUDA(mshadow::Stream<gpu>* s, |
| const T1 input, |
| const T2 output, |
| const int req, |
| const float normalize_factor) { |
| int blocks, H, W, C, N; |
| cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s); |
| if (std::is_same<T1, Tensor<gpu, 3, DType>>::value) { |
    // 3D Input - (H, W, C); the batch size is unused by the 3D kernel.
    N = 0;
| H = input.size(0); |
| W = input.size(1); |
| C = input.size(2); |
| blocks = 1; |
| } else { |
| // 4D Input - (N, H, W, C) |
| N = input.size(0); |
| H = input.size(1); |
| W = input.size(2); |
| C = input.size(3); |
| blocks = N > 0 ? N : 1; |
| } |
| |
| ToTensorCudaKernel<gpu, DType> |
| <<<blocks, dim3(32, 32), 0, stream>>>(input, output, req, N, H, W, C, normalize_factor); |
| MSHADOW_CUDA_POST_KERNEL_CHECK(ToTensorCudaKernel); |
| } |
| |
| // Normalize Forward CUDA Kernel |
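// Computes output[n][c][h][w] = (input[n][c][h][w] - mean[c]) / std[c]
// on NCHW data with up to three channels (mean_d*/std_d* hold the
// per-channel mean and standard deviation).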
| template <typename xpu, typename DType> |
| __global__ void __launch_bounds__(cuda::kMaxThreadsPerBlock, 1) |
| NormalizeCudaKernel(const DType* input, |
| DType* output, |
| const int req, |
| const int N, |
| const int C, |
| const int H, |
| const int W, |
| const float mean_d0, |
| const float mean_d1, |
| const float mean_d2, |
| const float std_d0, |
| const float std_d1, |
| const float std_d2) { |
| // We process one image per thread block. |
| const int n = blockIdx.x; |
| const int length = H * W; |
| const int step = C * length * n; |
| |
| float mean = mean_d0; |
| float std = std_d0; |
| for (int c = 0; c < C; ++c) { |
| switch (c) { |
| case 0: |
| break; |
| case 1: |
| mean = mean_d1; |
| std = std_d1; |
| break; |
| case 2: |
| mean = mean_d2; |
| std = std_d2; |
| break; |
| } |
| for (int i = threadIdx.x; i < length; i += blockDim.x) { |
| KERNEL_ASSIGN(*(output + step + i + (c * length)), |
| req, |
| (*(input + step + i + (c * length)) - mean) / std); |
| } |
| } |
| } |
| |
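/*
 * Dispatches the Normalize forward kernel: one thread block per image
 * (N blocks), with cuda::kMaxThreadsPerBlock threads striding over the
 * H * W elements of each channel.
 */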
| template <typename DType> |
| void NormalizeImplCUDA(mshadow::Stream<gpu>* s, |
| const DType* input, |
| DType* output, |
| const int req, |
| const int N, |
| const int C, |
| const int H, |
| const int W, |
| const float mean_d0, |
| const float mean_d1, |
| const float mean_d2, |
| const float std_d0, |
| const float std_d1, |
| const float std_d2) { |
| cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s); |
  // 1 image per block. N is batch size.
  NormalizeCudaKernel<gpu, DType>
      <<<N, dim3(cuda::kMaxThreadsPerBlock, 1), 0, stream>>>(
| input, output, req, N, C, H, W, mean_d0, mean_d1, mean_d2, std_d0, std_d1, std_d2); |
| MSHADOW_CUDA_POST_KERNEL_CHECK(NormalizeCudaKernel); |
| } |
| |
| // Normalize Backward Kernel |
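// Computes in_grad[n][c][h][w] = out_grad[n][c][h][w] / std[c]
// on NCHW data with up to three channels.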
| template <typename xpu, typename DType> |
| __global__ void __launch_bounds__(cuda::kMaxThreadsPerBlock, 1) |
| NormalizeBackwardCudaKernel(const DType* out_grad, |
| DType* in_grad, |
| const int req, |
| const int N, |
| const int C, |
| const int H, |
| const int W, |
| const float std_d0, |
| const float std_d1, |
| const float std_d2) { |
| // We process one image per thread block. |
| const int n = blockIdx.x; |
| const int length = H * W; |
| const int step = C * length * n; |
| |
| float std = std_d0; |
| for (int c = 0; c < C; ++c) { |
| switch (c) { |
| case 0: |
| break; |
| case 1: |
| std = std_d1; |
| break; |
| case 2: |
| std = std_d2; |
| break; |
| } |
| for (int i = threadIdx.x; i < length; i += blockDim.x) { |
| KERNEL_ASSIGN(*(in_grad + step + i + (c * length)), |
| req, |
| *(out_grad + step + i + (c * length)) * (1.0 / std)); |
| } |
| } |
| } |
| |
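/*
 * Dispatches the Normalize backward kernel with the same launch
 * configuration as the forward pass: one thread block per image,
 * cuda::kMaxThreadsPerBlock threads per block.
 */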
| template <typename DType> |
| void NormalizeBackwardImplCUDA(mshadow::Stream<gpu>* s, |
| const DType* out_grad, |
| DType* in_grad, |
| const int req, |
| const int N, |
| const int C, |
| const int H, |
| const int W, |
| const float std_d0, |
| const float std_d1, |
| const float std_d2) { |
| cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s); |
  // 1 image per block. N is batch size.
  NormalizeBackwardCudaKernel<gpu, DType>
      <<<N, dim3(cuda::kMaxThreadsPerBlock, 1), 0, stream>>>(
| out_grad, in_grad, req, N, C, H, W, std_d0, std_d1, std_d2); |
| MSHADOW_CUDA_POST_KERNEL_CHECK(NormalizeBackwardCudaKernel); |
| } |
| |
| NNVM_REGISTER_OP(_image_to_tensor).set_attr<FCompute>("FCompute<gpu>", ToTensorOpForward<gpu>); |
| |
| NNVM_REGISTER_OP(_image_normalize).set_attr<FCompute>("FCompute<gpu>", NormalizeOpForward<gpu>); |
| |
| NNVM_REGISTER_OP(_backward_image_normalize) |
| .set_attr<FCompute>("FCompute<gpu>", NormalizeOpBackward<gpu>); |
| |
| } // namespace image |
| } // namespace op |
| } // namespace mxnet |