| /*! |
| ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** |
| * |
| * COPYRIGHT |
| * |
| * All contributions by the University of California: |
| * Copyright (c) 2014-2017 The Regents of the University of California (Regents) |
| * All rights reserved. |
| * |
| * All other contributions: |
| * Copyright (c) 2014-2017, the respective contributors |
| * All rights reserved. |
| * |
| * Caffe uses a shared copyright model: each contributor holds copyright over |
| * their contributions to Caffe. The project versioning records all such |
| * contribution and copyright details. If a contributor wants to further mark |
| * their specific copyright on a particular contribution, they should indicate |
| * their copyright solely in the commit message of the change when it is |
| * committed. |
| * |
| * LICENSE |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, this |
| * list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
| * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| * |
| * CONTRIBUTION AGREEMENT |
| * |
| * By contributing to the BVLC/caffe repository through pull-request, comment, |
| * or otherwise, the contributor releases their content to the |
| * license and copyright terms herein. |
| * |
| ***************** END Caffe Copyright Notice and Disclaimer ******************** |
| * |
| * Copyright (c) 2017 by Contributors |
| * \file im2col.h |
| * \brief Function definitions of converting an image to |
| * column matrix based on kernel, padding, and dilation. |
| * These functions are mainly used in convolution operators. |
| * The implementation of the im2col and col2im algorithms |
| * are copied from Caffe with minor interface modifications |
| * adapting to MXNet data structures. |
| */ |
| |
| #ifndef MXNET_OPERATOR_NN_IM2COL_H_ |
| #define MXNET_OPERATOR_NN_IM2COL_H_ |
| |
| #include <mxnet/base.h> |
| #include <mxnet/operator.h> |
| #include <cstring> |
| #include <vector> |
| #include "../mxnet_op.h" |
| |
| namespace mxnet { |
| namespace op { |
| |
// Checks 0 <= a < b with a single unsigned comparison: casting a negative
// `a` to unsigned produces a value of at least 0x80000000, which is larger
// than any positive `b` the callers pass, so one compare covers both the
// lower and the upper bound.
inline bool is_a_ge_zero_and_a_lt_b(int a, int b) {
  const unsigned ua = static_cast<unsigned>(a);
  const unsigned ub = static_cast<unsigned>(b);
  return ua < ub;
}

/*!
 * \brief im2col 2D cpu version.
 * DO NOT call this function directly.
 * Use the wrapper function im2col() instead.
 * \param data_im pointer to one image (C, H, W) in the batch
 * \param channels number of input channels
 * \param height input image height
 * \param width input image width
 * \param kernel_h kernel height
 * \param kernel_w kernel width
 * \param pad_h vertical padding
 * \param pad_w horizontal padding
 * \param stride_h vertical stride
 * \param stride_w horizontal stride
 * \param dilation_h vertical dilation
 * \param dilation_w horizontal dilation
 * \param data_col output column buffer, written sequentially in
 *        (channel, kernel_row, kernel_col, output_row, output_col) order
 */
template <typename DType>
inline void im2col_cpu(const DType* data_im, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    DType* data_col) {
  // Spatial extent of the convolution output.
  const int output_h = (height + 2 * pad_h -
      (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int output_w = (width + 2 * pad_w -
      (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
  const int channel_size = height * width;
  // NOTE(junwu): OpenMP (with and without a collapse clause) was evaluated
  // on these loops; total convolution runtime got worse for some shapes and
  // barely better for others, because the bottleneck is believed to lie in
  // the subsequent dot() call. The loops therefore stay single-threaded;
  // to be revisited in the future.
  for (int c = 0; c < channels; ++c) {
    const DType* im_channel = data_im + c * channel_size;
    for (int kernel_row = 0; kernel_row < kernel_h; ++kernel_row) {
      for (int kernel_col = 0; kernel_col < kernel_w; ++kernel_col) {
        for (int out_row = 0; out_row < output_h; ++out_row) {
          const int input_row = -pad_h + kernel_row * dilation_h + out_row * stride_h;
          if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
            // The sampled row lies entirely in the vertical padding.
            for (int out_col = 0; out_col < output_w; ++out_col) {
              *(data_col++) = 0;
            }
          } else {
            for (int out_col = 0; out_col < output_w; ++out_col) {
              const int input_col = -pad_w + kernel_col * dilation_w + out_col * stride_w;
              *(data_col++) = is_a_ge_zero_and_a_lt_b(input_col, width)
                  ? im_channel[input_row * width + input_col]
                  : static_cast<DType>(0);
            }
          }
        }
      }
    }
  }
}
| |
/*!
 * \brief core function shared by the N-D im2col and col2im algorithms.
 * DO NOT call this function directly.
 * Use wrapper function im2col() or col2im() instead.
 * \param data_input source buffer pointing at the first element of the
 *        channel dim: the image for im2col, the column buffer for col2im
 * \param im2col true runs im2col (image -> column buffer); false runs
 *        col2im (column buffer accumulated back into the image)
 * \param im_shape input image shape in dimensions (N, C, H, W,)
 * \param col_shape column buffer shape; col_shape[0] enumerates
 *        (channel, kernel offset) pairs, col_shape[1..] the spatial output
 * \param kernel_shape kernel filter shape; its ndim() defines the number
 *        of spatial axes
 * \param pad pad shape
 * \param stride stride shape
 * \param dilation dilation shape
 * \param data_output destination buffer: the column buffer for im2col,
 *        the image for col2im
 * \param req write request type; kNullOp makes the call a no-op, and
 *        kAddTo skips the zero-fill of the image in the col2im path
 */
template <typename DType>
inline void im2col_nd_core_cpu(const DType* data_input, const bool im2col,
    const mxnet::TShape& im_shape, const mxnet::TShape& col_shape,
    const mxnet::TShape& kernel_shape, const mxnet::TShape& pad, const mxnet::TShape& stride,
    const mxnet::TShape& dilation, DType* data_output, OpReqType req = mxnet::kWriteTo) {
  if (mxnet::kNullOp == req) return;
  index_t num_spatial_axes = kernel_shape.ndim();
  if (!im2col) {
    // col2im accumulates into the image below, so the image must start
    // from zero unless the caller explicitly requested accumulation.
    index_t im_size = im_shape[1];  // skip batch dim; C * spatial extent
    for (index_t i = 0; i < num_spatial_axes; ++i) {
      im_size *= im_shape[2 + i];
    }
    if (mxnet::kAddTo != req) {
      std::fill(data_output, data_output+im_size, static_cast<DType>(0));
    }
  }
  // Number of elements covered by one kernel application.
  index_t kernel_size = 1;
  for (index_t i = 0; i < num_spatial_axes; ++i) {
    kernel_size *= kernel_shape[i];
  }
  const index_t channels_col = col_shape[0];
  // d_offset: per-axis kernel offset encoded in the current c_col.
  std::vector<index_t> d_offset(num_spatial_axes, 0);
  // d_iter: per-axis spatial output position, advanced like an odometer.
  std::vector<index_t> d_iter(num_spatial_axes, 0);
  for (index_t c_col = 0; c_col < channels_col; ++c_col) {
    // Loop over spatial axes in reverse order to compute a per-axis offset:
    // decompose c_col into mixed-radix digits with radices kernel_shape[d_i].
    index_t offset = c_col;
    for (int d_i = static_cast<int>(num_spatial_axes) - 1; d_i >= 0; --d_i) {
      if (d_i < static_cast<int>(num_spatial_axes) - 1) {
        offset /= kernel_shape[d_i + 1];
      }
      d_offset[d_i] = offset % kernel_shape[d_i];
    }
    // Visit every spatial output position for this (channel, kernel offset).
    for (bool incremented = true; incremented; ) {
      // Loop over spatial axes in forward order to compute the indices in the
      // image and column, and whether the index lies in the padding.
      index_t index_col = c_col;
      int index_im = c_col / kernel_size;  // recover the image channel
      bool is_padding = false;
      for (index_t d_i = 0; d_i < num_spatial_axes; ++d_i) {
        const index_t d = d_iter[d_i];
        // Image coordinate sampled by this output position / kernel offset;
        // out-of-range coordinates mean the tap falls in the padding.
        const int d_im = static_cast<int>(d * stride[d_i] + d_offset[d_i] * dilation[d_i])
          - static_cast<int>(pad[d_i]);
        is_padding |= d_im < 0 || d_im >= static_cast<int>(im_shape[d_i + 2]);
        index_col *= col_shape[d_i + 1];
        index_col += d;
        index_im *= static_cast<int>(im_shape[d_i + 2]);
        index_im += d_im;
      }
      if (im2col) {
        if (is_padding) {
          data_output[index_col] = 0;
        } else {
          data_output[index_col] = data_input[index_im];
        }
      } else if (!is_padding) {  // col2im: padded taps contribute nothing
        data_output[index_im] += data_input[index_col];
      }
      // Loop over spatial axes in reverse order to choose an index,
      // like counting: odometer increment of d_iter; the walk ends once
      // every axis wraps back to zero.
      incremented = false;
      for (int d_i = static_cast<int>(num_spatial_axes) - 1; d_i >= 0; --d_i) {
        const index_t d_max = col_shape[d_i + 1];
        CHECK_LT(d_iter[d_i], d_max);
        if (d_iter[d_i] + 1 == d_max) {
          d_iter[d_i] = 0;
        } else {  // d_iter[d_i] < d_max - 1
          ++d_iter[d_i];
          incremented = true;
          break;
        }
      }
    }  // while(incremented)
  }  // for (int c = 0; c < channels_col; ++c)
}
| |
| /*! |
| * \brief cpu function of im2col algorithm |
| * \param data_im pointer of a image (C, H, W,...) in the image batch |
| * \param im_shape input image shape in dimensions (N, C, H, W,) |
| * \param col_shape column buffer shape |
| * \param kernel_shape kernel filter shape |
| * \param pad pad shape |
| * \param stride stride shape |
| * \param dilation dilation shape |
| * \param data_col start pointer of the column buffer to be filled |
| */ |
| template <typename DType> |
| inline void im2col(mshadow::Stream<cpu>* s, |
| const DType* data_im, const mxnet::TShape& im_shape, |
| const mxnet::TShape& col_shape, const mxnet::TShape& kernel_shape, |
| const mxnet::TShape& pad, const mxnet::TShape& stride, |
| const mxnet::TShape& dilation, DType* data_col) { |
| if (2 == kernel_shape.ndim()) { |
| im2col_cpu(data_im, im_shape[1], im_shape[2], im_shape[3], |
| kernel_shape[0], kernel_shape[1], pad[0], pad[1], |
| stride[0], stride[1], dilation[0], dilation[1], data_col); |
| } else { |
| im2col_nd_core_cpu(data_im, true, im_shape, col_shape, |
| kernel_shape, pad, stride, dilation, data_col); |
| } |
| } |
| |
| /*! |
| * \brief col2im 2D cpu version. |
| * DO NOT call this function directly. Use wrapper function col2im() instead. |
| */ |
| template <typename DType> |
| inline void col2im_cpu(const DType* data_col, const int channels, |
| const int height, const int width, const int kernel_h, const int kernel_w, |
| const int pad_h, const int pad_w, |
| const int stride_h, const int stride_w, |
| const int dilation_h, const int dilation_w, |
| DType* data_im, OpReqType req) { |
| if (mxnet::kNullOp == req) return; |
| if (mxnet::kAddTo != req) { |
| std::fill(data_im, data_im+height*width*channels, static_cast<DType>(0)); |
| } |
| const int output_h = (height + 2 * pad_h - |
| (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; |
| const int output_w = (width + 2 * pad_w - |
| (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; |
| const int channel_size = height * width; |
| // TODO(junwu): we tested adding openmp (w/ & w/o collapse clause) here |
| // for testing the performance of convolution operator, |
| // but the total runtime increased by 0.8s for images of shape |
| // (8, 32, 64, 64) and decreased by 0.2s for images of shape |
| // (16, 64, 64, 64). Both kernel shapes are (8, 8). We think the |
| // bottleneck of the convolution operator probably lies in dot(). |
| // Hence, adding more threads to the loops contributes little |
| // toward improving the convolution operator's performance. |
| // We will revisit this issue in the future. |
| for (int channel = channels; channel--; data_im += channel_size) { |
| for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { |
| for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { |
| int input_row = -pad_h + kernel_row * dilation_h; |
| for (int output_rows = output_h; output_rows; output_rows--) { |
| if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { |
| data_col += output_w; |
| } else { |
| int input_col = -pad_w + kernel_col * dilation_w; |
| for (int output_col = output_w; output_col; output_col--) { |
| if (is_a_ge_zero_and_a_lt_b(input_col, width)) { |
| data_im[input_row * width + input_col] += *data_col; |
| } |
| data_col++; |
| input_col += stride_w; |
| } |
| } |
| input_row += stride_h; |
| } |
| } |
| } |
| } |
| } |
| |
| /*!\brief |
| * cpu function of col2im algorithm |
| * \param s device stream |
| * \param data_col start pointer of the column buffer to be filled |
| * \param im_shape input image shape in dimensions (N, C, H, W,) |
| * \param col_shape column buffer shape |
| * \param kernel_shape kernel filter shape |
| * \param pad pad shape |
| * \param stride stride shape |
| * \param dilation dilation shape |
| * \param data_im pointer of a image (C, H, W,...) in the image batch |
| */ |
| template <typename DType> |
| inline void col2im(mshadow::Stream<cpu>* s, |
| const DType* data_col, const mxnet::TShape& im_shape, |
| const mxnet::TShape& col_shape, const mxnet::TShape& kernel_shape, |
| const mxnet::TShape& pad, const mxnet::TShape& stride, |
| const mxnet::TShape& dilation, DType* data_im, OpReqType req) { |
| index_t num_spatial_axes = kernel_shape.ndim(); |
| if (2 == num_spatial_axes) { |
| col2im_cpu(data_col, im_shape[1], im_shape[2], im_shape[3], |
| kernel_shape[0], kernel_shape[1], pad[0], pad[1], |
| stride[0], stride[1], dilation[0], dilation[1], data_im, req); |
| } else { |
| im2col_nd_core_cpu(data_col, false, im_shape, col_shape, |
| kernel_shape, pad, stride, dilation, data_im, req); |
| } |
| } |
| |
| } // namespace op |
| } // namespace mxnet |
| #ifdef __CUDACC__ |
| #include "./im2col.cuh" |
| #endif |
| #endif // MXNET_OPERATOR_NN_IM2COL_H_ |