/*!
******************* BEGIN Caffe Copyright Notice and Disclaimer ****************
*
* COPYRIGHT
*
* All contributions by the University of California:
* Copyright (c) 2014-2017 The Regents of the University of California (Regents)
* All rights reserved.
*
* All other contributions:
* Copyright (c) 2014-2017, the respective contributors
* All rights reserved.
*
* Caffe uses a shared copyright model: each contributor holds copyright over
* their contributions to Caffe. The project versioning records all such
* contribution and copyright details. If a contributor wants to further mark
* their specific copyright on a particular contribution, they should indicate
* their copyright solely in the commit message of the change when it is
* committed.
*
* LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* CONTRIBUTION AGREEMENT
*
* By contributing to the BVLC/caffe repository through pull-request, comment,
* or otherwise, the contributor releases their content to the
* license and copyright terms herein.
*
***************** END Caffe Copyright Notice and Disclaimer ********************
*
* Copyright (c) 2017 by Contributors
* \file im2col.cuh
* \brief Function definitions for converting an image to a
* column matrix based on kernel, padding, and dilation.
* These functions are mainly used in convolution operators.
* The implementations of the im2col and col2im algorithms
* are copied from Caffe with minor interface modifications
* to adapt them to MXNet data structures.
*/
#ifndef MXNET_OPERATOR_NN_IM2COL_CUH_
#define MXNET_OPERATOR_NN_IM2COL_CUH_
#include <mxnet/base.h>
#include <mxnet/operator.h>
#include <algorithm>
#include <cstring>
#include <vector>
#include "../mxnet_op.h"
namespace mxnet {
namespace op {
/*!
* \brief im2col gpu kernel.
* DO NOT call this directly. Use wrapper function im2col() instead;
*/
template <typename DType>
__global__ void im2col_gpu_kernel(const int n, const DType* data_im,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int height_col, const int width_col,
DType* data_col) {
CUDA_KERNEL_LOOP(index, n) {
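// Each thread handles one (channel, output row, output column) location:
// decompose the flat index, then copy the kernel_h x kernel_w input patch
// for that location into the column buffer (zeros for padded positions).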
const int h_index = index / width_col;
const int h_col = h_index % height_col;
const int w_col = index % width_col;
const int c_im = h_index / height_col;
const int c_col = c_im * kernel_h * kernel_w;
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
DType* data_col_ptr = data_col;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
const DType* data_im_ptr = data_im;
data_im_ptr += (c_im * height + h_offset) * width + w_offset;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
int h_im = h_offset + i * dilation_h;
int w_im = w_offset + j * dilation_w;
*data_col_ptr =
(h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
data_im_ptr[i * dilation_h * width + j * dilation_w] : static_cast<DType>(0);
data_col_ptr += height_col * width_col;
}
}
}
}
/*!
* \brief DO NOT call this directly. Use wrapper function im2col() instead;
*/
template <typename DType>
inline void im2col_gpu(mshadow::Stream<gpu>* s,
const DType* data_im, const int channels,
const int height, const int width,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
DType* data_col) {
// We are going to launch channels * height_col * width_col threads, each
// thread responsible for copying a single kernel-sized patch of one input
// channel into the column buffer.
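// Output spatial size of a convolution:
//   out = (in + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1
// e.g. in = 32, kernel = 3, pad = 1, stride = 1, dilation = 1 gives out = 32.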
int height_col = (height + 2 * pad_h -
(dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
int width_col = (width + 2 * pad_w -
(dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
int num_kernels = channels * height_col * width_col;
using namespace mxnet_op;
// NOLINT_NEXT_LINE(whitespace/operators)
im2col_gpu_kernel<DType><<<cuda_get_num_blocks(num_kernels), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h,
pad_w, stride_h, stride_w, dilation_h, dilation_w, height_col,
width_col, data_col);
MSHADOW_CUDA_POST_KERNEL_CHECK(im2col_gpu_kernel);
}
/*!
* \brief DO NOT call this directly. Use wrapper function col2im() instead;
*/
template <typename DType>
__global__ void col2im_gpu_kernel(const int n, const DType* data_col,
const int channels, const int height, const int width,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int height_col, const int width_col,
DType* data_im, OpReqType req) {
CUDA_KERNEL_LOOP(index, n) {
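// Each thread reconstructs one input-image element (c_im, h_im, w_im) by
// summing every column-buffer entry whose kernel window covers it.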
DType val = 0;
const int w_im = index % width + pad_w;
const int h_im = (index / width) % height + pad_h;
const int c_im = index / (width * height);
int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
// compute the start and end of the output
const int w_col_start =
(w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start =
(h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
// TODO(caffe): use LCM of stride and dilation to avoid unnecessary loops
for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) {
for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) {
int h_k = (h_im - h_col * stride_h);
int w_k = (w_im - w_col * stride_w);
if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
h_k /= dilation_h;
w_k /= dilation_w;
int data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *
height_col + h_col) * width_col + w_col;
val += data_col[data_col_index];
}
}
}
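// Write or accumulate into data_im according to the output request type
// (e.g. overwrite for kWriteTo, add for kAddTo).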
KERNEL_ASSIGN(data_im[index], req, val);
}
}
using mshadow::Shape;
/*!
* \brief DO NOT call this directly. Use wrapper function im2col() instead;
*/
template <typename DType, int num_axes>
__global__ void im2col_nd_gpu_kernel(const int n, const DType* data_im,
const Shape<num_axes+2> im_shape, const Shape<num_axes+1> col_shape,
const Shape<num_axes> kernel_shape, const Shape<num_axes> pad, const Shape<num_axes> stride,
const Shape<num_axes> dilation, DType* data_col) {
int d_temp[num_axes]; // NOLINT(runtime/arrays)
int d_iter[num_axes]; // NOLINT(runtime/arrays)
__shared__ int shared_dilation[num_axes];
__shared__ int shared_kernel_shape[num_axes];
__shared__ int shared_pad[num_axes];
__shared__ int shared_stride[num_axes];
__shared__ int shared_col_shape[num_axes + 1];
__shared__ int shared_im_shape[num_axes + 1];
if (threadIdx.x < num_axes) {
shared_dilation[threadIdx.x] = dilation[threadIdx.x];
shared_kernel_shape[threadIdx.x] = kernel_shape[threadIdx.x];
shared_pad[threadIdx.x] = pad[threadIdx.x];
shared_stride[threadIdx.x] = stride[threadIdx.x];
}
if (threadIdx.x < num_axes + 1) {
shared_col_shape[threadIdx.x] = col_shape[threadIdx.x];
shared_im_shape[threadIdx.x] = im_shape[threadIdx.x+1]; // skip batch dim
}
__syncthreads();
int i;
CUDA_KERNEL_LOOP(index, n) {
// channel_in starts as the flat column-buffer index; the loop below peels
// off the spatial coordinates (stored in d_temp), leaving the input channel,
// while channel_out accumulates the kernel volume.
int channel_in = index;
int channel_out = 1;
for (i = num_axes - 1; i >= 0; --i) {
d_temp[i] = channel_in % shared_col_shape[i + 1];
channel_in /= shared_col_shape[i + 1];
channel_out *= shared_kernel_shape[i];
}
channel_out *= channel_in;
int data_col_inc = 1;
for (i = 0; i < num_axes; ++i) {
channel_out *= shared_col_shape[i + 1];
channel_out += d_temp[i];
d_temp[i] = d_temp[i] * shared_stride[i] - shared_pad[i];
channel_in *= shared_im_shape[i + 1];
channel_in += d_temp[i];
data_col_inc *= shared_col_shape[i + 1];
d_iter[i] = 0;
}
DType* data_col_ptr = data_col + channel_out;
const DType* data_im_ptr = data_im + channel_in;
bool incremented;
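// Odometer-style iteration over all kernel offsets: copy in-range image
// elements into the column buffer and write zeros for padded positions.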
do {
bool in_range = true;
for (i = 0; i < num_axes; ++i) {
const int d_iter_im = d_iter[i] * shared_dilation[i] + d_temp[i];
in_range &= d_iter_im >= 0 && d_iter_im < shared_im_shape[i + 1];
if (!in_range) { break; }
}
if (in_range) {
int data_im_offset = d_iter[0] * shared_dilation[0];
for (i = 1; i < num_axes; ++i) {
data_im_offset *= shared_im_shape[i + 1];
data_im_offset += d_iter[i] * shared_dilation[i];
}
*data_col_ptr = data_im_ptr[data_im_offset];
} else {
*data_col_ptr = 0;
}
data_col_ptr += data_col_inc;
incremented = false;
for (i = num_axes - 1; i >= 0; --i) {
const int d_max = shared_kernel_shape[i];
if (d_iter[i] == d_max - 1) {
d_iter[i] = 0;
} else { // d_iter[i] < d_max - 1
++d_iter[i];
incremented = true;
break;
}
} // for (int i = num_axes - 1; i >= 0; --i)
} while (incremented); // do
} // CUDA_KERNEL_LOOP(index, n)
}
/*!\brief im2col gpu version
* \param s device stream
* \param data_im pointer of an image (C, H, W, ...) in the image batch
* \param im_shape input image shape in dimensions (N, C, H, W, ...)
* \param col_shape column buffer shape (#channels, output_im_height, output_im_width, ...)
* \param kernel_shape kernel filter shape
* \param pad pad shape
* \param stride stride shape
* \param dilation dilation shape
* \param data_col column buffer pointer
*/
template <typename DType>
inline void im2col(mshadow::Stream<gpu>* s,
const DType* data_im, const mxnet::TShape& im_shape,
const mxnet::TShape& col_shape, const mxnet::TShape& kernel_shape,
const mxnet::TShape& pad, const mxnet::TShape& stride,
const mxnet::TShape& dilation, DType* data_col) {
// num_axes should be smaller than block size
index_t num_spatial_axes = kernel_shape.ndim();
CHECK_LT(num_spatial_axes, mshadow::cuda::kBaseThreadNum);
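// The n-d kernels stage the per-axis shape/stride/pad/dilation arrays into
// shared memory using threads with threadIdx.x < num_axes, so the number of
// spatial axes must not exceed the thread-block size.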
index_t num_kernels = im_shape[1] * col_shape.ProdShape(1, col_shape.ndim());
using namespace mxnet_op;
switch (num_spatial_axes) {
case 1:
im2col_nd_gpu_kernel<DType, 1> // NOLINT_NEXT_LINE(whitespace/operators)
<<<cuda_get_num_blocks(num_kernels), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
num_kernels, data_im, im_shape.get<3>(), col_shape.get<2>(),
kernel_shape.get<1>(), pad.get<1>(), stride.get<1>(), dilation.get<1>(), data_col);
break;
case 2:
im2col_gpu_kernel<DType> // NOLINT_NEXT_LINE(whitespace/operators)
<<<cuda_get_num_blocks(num_kernels), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
num_kernels, data_im, im_shape[2], im_shape[3], kernel_shape[0], kernel_shape[1],
pad[0], pad[1], stride[0], stride[1], dilation[0], dilation[1],
col_shape[1], col_shape[2], data_col);
break;
case 3:
im2col_nd_gpu_kernel<DType, 3> // NOLINT_NEXT_LINE(whitespace/operators)
<<<cuda_get_num_blocks(num_kernels), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
num_kernels, data_im, im_shape.get<5>(), col_shape.get<4>(),
kernel_shape.get<3>(), pad.get<3>(), stride.get<3>(), dilation.get<3>(), data_col);
break;
default:
LOG(FATAL) << "im2col_nd_gpu does not support computation with "
<< num_spatial_axes << " spatial axes";
}
MSHADOW_CUDA_POST_KERNEL_CHECK(im2col_nd_gpu_kernel);
}
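/*
 * A minimal usage sketch (hedged; the shapes, stream and pointer names below
 * are hypothetical, not part of this header): for a 2D convolution over an
 * NCHW batch with C = 3 input channels, H = W = 32, a 3x3 kernel, pad = 1,
 * stride = 1 and dilation = 1, the per-image column buffer has shape
 * (C * 3 * 3, H_out, W_out) = (27, 32, 32), and a caller would do roughly
 *
 *   im2col(s, im_dptr, im_shape, col_shape, kernel, pad, stride, dilation, col_dptr);
 *
 * where im_shape = (N, C, H, W), col_shape = (27, 32, 32), kernel, pad,
 * stride and dilation are 2-element TShapes, and im_dptr / col_dptr are
 * device pointers. The resulting (27, H_out * W_out) column matrix can then
 * be multiplied by a (num_filter, 27) weight matrix to produce the
 * convolution output, which is how the convolution operators use it.
 */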
/*!
* \brief DO NOT call this directly. Use wrapper function col2im() instead;
*/
template <typename DType>
inline void col2im_gpu(mshadow::Stream<gpu>* s, const DType* data_col, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
DType* data_im, OpReqType req) {
int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) /
stride_h + 1;
int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) /
stride_w + 1;
int num_kernels = channels * height * width;
using namespace mxnet_op;
// To avoid atomic operations, we launch one thread per input-image element
// and let each thread sum the column-buffer entries that map to it.
// NOLINT_NEXT_LINE(whitespace/operators)
col2im_gpu_kernel<DType><<<cuda_get_num_blocks(num_kernels), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
num_kernels, data_col, channels, height, width, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
height_col, width_col, data_im, req);
MSHADOW_CUDA_POST_KERNEL_CHECK(col2im_gpu_kernel);
}
/*!
* \brief DO NOT call this directly. Use wrapper function col2im() instead;
*/
template <typename DType, int num_axes>
__global__ void col2im_nd_gpu_kernel(const int n, const DType* data_col,
const Shape<num_axes+2> im_shape, const Shape<num_axes+1> col_shape,
const Shape<num_axes> kernel_shape, const Shape<num_axes> pad, const Shape<num_axes> stride,
const Shape<num_axes> dilation, DType* data_im, OpReqType req) {
int d_im[num_axes]; // NOLINT(runtime/arrays)
int d_col_iter[num_axes]; // NOLINT(runtime/arrays)
int d_col_start[num_axes]; // NOLINT(runtime/arrays)
int d_col_end[num_axes]; // NOLINT(runtime/arrays)
__shared__ int shared_dilation[num_axes];
__shared__ int shared_kernel_shape[num_axes];
__shared__ int shared_pad[num_axes];
__shared__ int shared_stride[num_axes];
__shared__ int shared_col_shape[num_axes + 1];
__shared__ int shared_im_shape[num_axes + 1];
if (threadIdx.x < num_axes) {
shared_dilation[threadIdx.x] = dilation[threadIdx.x];
shared_kernel_shape[threadIdx.x] = kernel_shape[threadIdx.x];
shared_pad[threadIdx.x] = pad[threadIdx.x];
shared_stride[threadIdx.x] = stride[threadIdx.x];
}
if (threadIdx.x < num_axes + 1) {
shared_col_shape[threadIdx.x] = col_shape[threadIdx.x];
shared_im_shape[threadIdx.x] = im_shape[threadIdx.x+1]; // skip batch dim
}
__syncthreads();
CUDA_KERNEL_LOOP(index, n) {
// c_im starts as the flat image index; the loop below peels off the padded
// spatial coordinates (stored in d_im), leaving the input channel.
int c_im = index;
// Calculate d_im (image dimensions).
for (int i = num_axes - 1; i >= 0; --i) {
d_im[i] = c_im % shared_im_shape[i + 1] + shared_pad[i];
c_im /= shared_im_shape[i + 1];
}
// Calculate col start/end indices.
bool done = false;
for (int i = 0; i < num_axes; ++i) {
const int kernel_extent =
shared_dilation[i] * (shared_kernel_shape[i] - 1) + 1;
d_col_start[i] = d_col_iter[i] =
(d_im[i] < kernel_extent) ? 0 :
(d_im[i] - kernel_extent) / shared_stride[i] + 1;
d_col_end[i] =
min(d_im[i] / shared_stride[i] + 1, shared_col_shape[i + 1]);
if (d_col_start[i] >= d_col_end[i]) {
// Skip computation if the dimension is 0 at any spatial axis --
// final val will be 0.
data_im[index] = 0;
done = true;
break; // for (int i = 0; i < num_axes; ++i)
}
}
if (done) {
continue; // CUDA_KERNEL_LOOP(index, n)
}
// Loop over the col to compute the output val.
DType val = 0;
bool incremented = true;
bool skip = false;
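// Odometer-style iteration over every column-buffer location whose kernel
// window covers this image element; offsets that are not multiples of the
// dilation do not contribute and are skipped.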
do {
// Compute the final offset.
int final_offset = 0;
int kernel_shape_prod = 1;
int kernel_index;
for (int i = num_axes - 1; i >= 0; --i) {
kernel_index = d_im[i] - d_col_iter[i] * shared_stride[i];
if (kernel_index % shared_dilation[i]) {
skip = true;
break;
} else {
kernel_index /= shared_dilation[i];
final_offset += kernel_index * kernel_shape_prod;
kernel_shape_prod *= shared_kernel_shape[i];
}
}
if (!skip) {
final_offset += kernel_shape_prod * c_im;
for (int i = 0; i < num_axes; ++i) {
final_offset *= shared_col_shape[i + 1];
final_offset += d_col_iter[i];
}
val += data_col[final_offset];
}
skip = false;
incremented = false;
for (int i = num_axes - 1; i >= 0; --i) {
const int d_max = d_col_end[i];
if (d_col_iter[i] == d_max - 1) {
d_col_iter[i] = d_col_start[i];
} else { // d_col_iter[i] < d_max - 1
++d_col_iter[i];
incremented = true;
break; // for (int i = num_axes - 1; i >= 0; --i)
}
} // for (int i = num_axes - 1; i >= 0; --i)
} while (incremented);
KERNEL_ASSIGN(data_im[index], req, val);
} // CUDA_KERNEL_LOOP(index, n)
}
/*!\brief
* gpu function of col2im algorithm
* \param s device stream
* \param data_col start pointer of the column buffer to be read
* \param im_shape input image shape in dimensions (N, C, H, W)
* \param col_shape column buffer shape
* \param kernel_shape kernel filter shape
* \param pad pad shape
* \param stride stride shape
* \param dilation dilation shape
* \param data_im pointer of an image (C, H, W, ...) in the image batch, to be filled
* \param req output request type; controls whether data_im is overwritten or accumulated into
*/
template <typename DType>
inline void col2im(mshadow::Stream<gpu>* s,
const DType* data_col, const mxnet::TShape& im_shape,
const mxnet::TShape& col_shape, const mxnet::TShape& kernel_shape,
const mxnet::TShape& pad, const mxnet::TShape& stride,
const mxnet::TShape& dilation, DType* data_im, OpReqType req) {
index_t num_spatial_axes = kernel_shape.ndim();
index_t im_size = im_shape.ProdShape(1, im_shape.ndim());
// num_axes should be smaller than block size
CHECK_LT(num_spatial_axes, mshadow::cuda::kBaseThreadNum);
using namespace mxnet_op;
switch (num_spatial_axes) {
case 1:
col2im_nd_gpu_kernel<DType, 1> // NOLINT_NEXT_LINE(whitespace/operators)
<<<cuda_get_num_blocks(im_size), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
im_size, data_col, im_shape.get<3>(), col_shape.get<2>(),
kernel_shape.get<1>(), pad.get<1>(), stride.get<1>(), dilation.get<1>(),
data_im, req);
MSHADOW_CUDA_POST_KERNEL_CHECK(col2im_nd_gpu_kernel);
break;
case 2:
// To avoid atomic operations, we launch one thread per input-image element
// and let each thread sum the column-buffer entries that map to it.
// NOLINT_NEXT_LINE(whitespace/operators)
col2im_gpu_kernel<DType><<<cuda_get_num_blocks(im_size), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
im_size, data_col, im_shape[1], im_shape[2], im_shape[3],
kernel_shape[0], kernel_shape[1], pad[0], pad[1], stride[0], stride[1],
dilation[0], dilation[1], col_shape[1], col_shape[2], data_im, req);
MSHADOW_CUDA_POST_KERNEL_CHECK(col2im_gpu_kernel);
break;
case 3:
col2im_nd_gpu_kernel<DType, 3> // NOLINT_NEXT_LINE(whitespace/operators)
<<<cuda_get_num_blocks(im_size), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
im_size, data_col, im_shape.get<5>(), col_shape.get<4>(),
kernel_shape.get<3>(), pad.get<3>(), stride.get<3>(), dilation.get<3>(),
data_im, req);
MSHADOW_CUDA_POST_KERNEL_CHECK(col2im_nd_gpu_kernel);
break;
default:
LOG(FATAL) << "col2im_nd_gpu does not support computation with "
<< num_spatial_axes << " spatial axes";
}
}
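/*
 * A minimal usage sketch (hedged; names and shapes are hypothetical): col2im
 * is the adjoint of im2col and is typically used in the backward pass of a
 * convolution to scatter-add column-buffer gradients back into the input
 * image gradient. With the same 2D setup as the im2col sketch above, a
 * caller would do roughly
 *
 *   col2im(s, col_grad_dptr, im_shape, col_shape, kernel, pad, stride,
 *          dilation, im_grad_dptr, req);
 *
 * where req selects whether im_grad is overwritten or accumulated into.
 */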
} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_NN_IM2COL_CUH_