/*!
******************* BEGIN Caffe Copyright Notice and Disclaimer ****************
*
* COPYRIGHT
*
* All contributions by the University of California:
* Copyright (c) 2014-2017 The Regents of the University of California (Regents)
* All rights reserved.
*
* All other contributions:
* Copyright (c) 2014-2017, the respective contributors
* All rights reserved.
*
* Caffe uses a shared copyright model: each contributor holds copyright over
* their contributions to Caffe. The project versioning records all such
* contribution and copyright details. If a contributor wants to further mark
* their specific copyright on a particular contribution, they should indicate
* their copyright solely in the commit message of the change when it is
* committed.
*
* LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* CONTRIBUTION AGREEMENT
*
* By contributing to the BVLC/caffe repository through pull-request, comment,
* or otherwise, the contributor releases their content to the
* license and copyright terms herein.
*
***************** END Caffe Copyright Notice and Disclaimer ********************
*
* \file pool.cuh
* \brief Function definitions for pooling 1-D/2-D/3-D images.
* The 2-D pixel-looping scheme is adopted from Caffe and extended here to the 1-D and 3-D cases.
* \ref https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu
* \author Jun Wu
*/
#ifndef MXNET_OPERATOR_NN_POOL_CUH_
#define MXNET_OPERATOR_NN_POOL_CUH_
#include <mxnet/base.h>
#include <mxnet/operator.h>
#include "./pool_utils.h"
#include "../mxnet_op.h"
#include "../mshadow_op.h"
#include "../../common/cuda/utils.h"
namespace mxnet {
namespace op {
/*!
* \brief max pooling gpu kernel for 1-D images, for both NCW and NWC layouts.
* Do not call this kernel directly. Use the interface pool().
*/
template <typename DType, int layout>
__global__ void pool_max_1d_gpu_kernel(const int nthreads, const DType* in_data,
const int channels, const int width,
const int pooled_width, const int kernel_w,
const int stride_w, const int pad_w,
DType* out_data) {
using mshadow::red::limits::MinValue;
// index is the output image's pixel index
CUDA_KERNEL_LOOP(index, nthreads) {
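// Decompose the flat output index into (n, c, pw). For NWC the channel is the
// fastest-varying dimension, so the spatial position comes from index / channels;
// for NCW the flat index already iterates over W fastest. Along W, neighbouring
// elements are `multiplier` apart (channels for NWC, 1 for NCW; see below).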
const bool nwc_layout = layout == mshadow::kNWC;
const int idx = nwc_layout ? (index / channels) : index;
const int pw = idx % pooled_width;
const int c = nwc_layout ? (index % channels) : (index / pooled_width) % channels;
const int n = index / pooled_width / channels;
int wstart = pw * stride_w - pad_w;
const int wend = min(wstart + kernel_w, width);
wstart = max(wstart, 0);
const DType* in_slice = nwc_layout ? in_data + n * channels * width + c
: in_data + (n * channels + c) * width;
DType max_val = MinValue<DType>();
const int multiplier = nwc_layout ? channels : 1;
for (int w = wstart; w < wend; ++w) {
const DType in_val = in_slice[w * multiplier];
if (in_val > max_val) {
max_val = in_val;
}
}
out_data[index] = max_val;
}
}
/*!
* \brief max pooling gpu kernel for 2-D images, for both NCHW and NHWC layouts.
* Do not call this kernel directly. Use the interface pool().
*/
template <typename DType, int layout>
__global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data,
const int channels, const int height, const int width,
const int pooled_height, const int pooled_width,
const int kernel_h, const int kernel_w, const int stride_h,
const int stride_w, const int pad_h, const int pad_w,
DType* out_data) {
using mshadow::red::limits::MinValue;
// index is the output image's pixel index
CUDA_KERNEL_LOOP(index, nthreads) {
const bool nhwc_layout = layout == mshadow::kNHWC;
const int idx = nhwc_layout ? (index / channels) : index;
const int pw = idx % pooled_width;
const int ph = (idx / pooled_width) % pooled_height;
const int c = nhwc_layout ? (index % channels)
: (index / pooled_width / pooled_height) % channels;
const int n = index / pooled_width / pooled_height / channels;
int hstart = ph * stride_h - pad_h;
int wstart = pw * stride_w - pad_w;
const int hend = min(hstart + kernel_h, height);
const int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
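// The window is clipped to the valid image region, so padded positions never
// contribute to the max; max_val starts from the type's minimum value instead.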
const DType* in_slice = nhwc_layout ? in_data + n * channels * height * width + c
: in_data + (n * channels + c) * height * width;
DType max_val = MinValue<DType>();
const int multiplier = nhwc_layout ? channels : 1;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const DType in_val = in_slice[(h * width + w) * multiplier];
if (in_val > max_val) {
max_val = in_val;
}
}
}
out_data[index] = max_val;
}
}
/*!
* \brief max pooling gpu kernel for 3-D images, for both NCDHW and NDHWC layouts.
* Do not call this kernel directly. Use the interface pool().
*/
template <typename DType, int layout>
__global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, const int channels,
const int depth, const int height, const int width,
const int pooled_depth, const int pooled_height,
const int pooled_width, const int kernel_d,
const int kernel_h, const int kernel_w, const int stride_d,
const int stride_h, const int stride_w, const int pad_d,
const int pad_h, const int pad_w,
DType* out_data) {
using mshadow::red::limits::MinValue;
// index is the output image's pixel index
CUDA_KERNEL_LOOP(index, nthreads) {
const bool ndhwc_layout = layout == mshadow::kNDHWC;
const int idx = ndhwc_layout ? (index / channels) : index;
const int pw = idx % pooled_width;
const int ph = (idx / pooled_width) % pooled_height;
const int pd = (idx / pooled_width / pooled_height) % pooled_depth;
const int c = ndhwc_layout ? (index % channels)
: (index / pooled_width / pooled_height / pooled_depth) % channels;
const int n = index / pooled_width / pooled_height / pooled_depth / channels;
int dstart = pd * stride_d - pad_d;
int hstart = ph * stride_h - pad_h;
int wstart = pw * stride_w - pad_w;
const int dend = min(dstart + kernel_d, depth);
const int hend = min(hstart + kernel_h, height);
const int wend = min(wstart + kernel_w, width);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const DType* in_slice = ndhwc_layout ? in_data + n * channels * depth * height * width + c
: in_data + (n * channels + c) * depth * height * width;
DType max_val = MinValue<DType>();
const int multiplier = ndhwc_layout ? channels : 1;
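// Walk the clipped 3-D window; the element offset is ((d * height + h) * width + w)
// scaled by `multiplier` to step over channels in the NDHWC case.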
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const DType in_val = in_slice[((d * height + h) * width + w) * multiplier];
if (in_val > max_val) {
max_val = in_val;
}
}
}
}
out_data[index] = max_val;
}
}
/*!
* \brief avg/sum pooling gpu kernel for 1-D images, for both NCW and NWC layouts.
* Do not call this kernel directly. Use the interface pool().
*/
template <typename DType, int layout, int p = 1>
__global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, const int channels,
const int width, const int pooled_width, const int kernel_w,
const int stride_w, const int pad_w, DType* out_data,
const bool get_avg = false,
const bool count_include_pad = true) {
using AccType = typename PoolingTypes<DType>::AccType;
CUDA_KERNEL_LOOP(index, nthreads) {
const bool nwc_layout = layout == mshadow::kNWC;
const int idx = nwc_layout ? (index / channels) : index;
const int pw = idx % pooled_width;
const int c = nwc_layout ? (index % channels) : (index / pooled_width) % channels;
const int n = index / pooled_width / channels;
int wstart = pw * stride_w - pad_w;
int wend = min(wstart + kernel_w, width + pad_w);
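// pool_size is computed before clipping the window to the image, so with
// count_include_pad == true the divisor also counts padded positions; with
// count_include_pad == false it is recomputed below from the clipped window.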
int pool_size = (get_avg? (wend - wstart) : 1);
wstart = max(wstart, 0);
wend = min(wend, width);
if (get_avg && !count_include_pad) {
pool_size = (wend - wstart);
}
AccType sum = 0;
const DType* out_slice = nwc_layout ? in_data + n * channels * width + c
: in_data + (n * channels + c) * width;
const int multiplier = nwc_layout ? channels : 1;
for (int w = wstart; w < wend; ++w) {
sum += a_pow_p<AccType, p>::Map(out_slice[w * multiplier]) / pool_size;
}
out_data[index] = a_root_p<AccType, p>::Map(sum);
}
}
/*!
* \brief avg/sum pooling gpu kernel for 2-D images, for both NCHW and NHWC layouts.
* Do not call this kernel directly. Use the interface pool().
*/
template <typename DType, int layout, int p = 1>
__global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels,
const int height, const int width,
const int pooled_height, const int pooled_width,
const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w,
const int pad_h, const int pad_w, DType* out_data,
const bool get_avg = false,
const bool count_include_pad = true) {
using AccType = typename PoolingTypes<DType>::AccType;
CUDA_KERNEL_LOOP(index, nthreads) {
const bool nhwc_layout = layout == mshadow::kNHWC;
const int idx = nhwc_layout ? (index / channels) : index;
const int pw = idx % pooled_width;
const int ph = (idx / pooled_width) % pooled_height;
const int c = nhwc_layout ? (index % channels)
: (index / pooled_width / pooled_height) % channels;
const int n = index / pooled_width / pooled_height / channels;
int hstart = ph * stride_h - pad_h;
int wstart = pw * stride_w - pad_w;
int hend = min(hstart + kernel_h, height + pad_h);
int wend = min(wstart + kernel_w, width + pad_w);
int pool_size = (get_avg? (hend - hstart) * (wend - wstart) : 1);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, height);
wend = min(wend, width);
if (get_avg && !count_include_pad) {
pool_size = (hend - hstart) * (wend - wstart);
}
AccType sum = 0;
const DType* out_slice = nhwc_layout ? in_data + n * channels * height * width + c
: in_data + (n * channels + c) * height * width;
const int multiplier = nhwc_layout ? channels : 1;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
sum += a_pow_p<AccType, p>::Map(out_slice[(h * width + w) * multiplier]) / pool_size;
}
}
out_data[index] = a_root_p<AccType, p>::Map(sum);
}
}
/*!
* \brief avg/sum pooling gpu kernel for 3-D images, for both NCDHW and NDHWC layouts.
* Do not call this kernel directly. Use the interface pool().
*/
template <typename DType, int layout, int p = 1>
__global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, const int channels,
const int depth, const int height, const int width,
const int pooled_depth, const int pooled_height,
const int pooled_width, const int kernel_d,
const int kernel_h, const int kernel_w,
const int stride_d, const int stride_h, const int stride_w,
const int pad_d, const int pad_h, const int pad_w,
DType* out_data, const bool get_avg = false,
const bool count_include_pad = true) {
using AccType = typename PoolingTypes<DType>::AccType;
CUDA_KERNEL_LOOP(index, nthreads) {
const bool ndhwc_layout = layout == mshadow::kNDHWC;
const int idx = ndhwc_layout ? (index / channels) : index;
const int pw = idx % pooled_width;
const int ph = (idx / pooled_width) % pooled_height;
const int pd = (idx / pooled_width / pooled_height) % pooled_depth;
const int c = ndhwc_layout ? (index % channels)
: (index / pooled_width / pooled_height / pooled_depth) % channels;
const int n = index / pooled_width / pooled_height / pooled_depth / channels;
int dstart = pd * stride_d - pad_d;
int hstart = ph * stride_h - pad_h;
int wstart = pw * stride_w - pad_w;
int dend = min(dstart + kernel_d, depth + pad_d);
int hend = min(hstart + kernel_h, height + pad_h);
int wend = min(wstart + kernel_w, width + pad_w);
int pool_size = (get_avg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
dend = min(dend, depth);
hend = min(hend, height);
wend = min(wend, width);
if (get_avg && !count_include_pad) {
pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
}
AccType sum = 0;
const DType* out_slice = ndhwc_layout ? in_data + n * channels * depth * height * width + c
: in_data + (n * channels + c) * depth * height * width;
const int multiplier = ndhwc_layout ? channels : 1;
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
sum += a_pow_p<AccType, p>::Map(out_slice[((d * height + h) * width + w) *
multiplier]) / pool_size;
}
}
}
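// With count_include_pad == false, a window that falls entirely into the padding has
// pool_size == 0 (and the loops above never execute); emit NaN for such empty windows.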
out_data[index] = (pool_size == 0) ?
AccType(nanf("")) :
a_root_p<AccType, p>::Map(sum);
}
}
/*!
* \brief max unpooling gpu kernel for 1-D images, for both NCW and NWC layouts.
* Do not call this kernel directly. Use the interface unpool().
*/
template <typename DType, int layout>
__global__ void unpool_max_1d_gpu_kernel(const int nthreads, const DType* out_grad,
const DType* in_data, const DType* out_data,
const int channels, const int width,
const int pooled_width, const int kernel_w,
const int stride_w, const int pad_w,
DType* in_grad) {
// index is the output image's pixel index
// the order has to be consistent with pooling max
// to avoid adding out_grad to the wrong in_grad
// in the case where there are multiple max pixels
// covered by a kernel window
CUDA_KERNEL_LOOP(index, nthreads) {
const bool nwc_layout = layout == mshadow::kNWC;
const int idx = nwc_layout ? (index / channels) : index;
const int pw = idx % pooled_width;
const int c = nwc_layout ? index % channels : (index / pooled_width) % channels;
const int n = index / channels / pooled_width;
int wstart = pw * stride_w - pad_w;
const int wend = min(wstart + kernel_w, width);
wstart = max(wstart, 0);
// in data/grad offset batch and channel dims
const int in_offset = nwc_layout ? n * channels * width + c : (n * channels + c) * width;
const DType* in_data_slice = in_data + in_offset;
int max_idx = -1;
DType max_val = out_data[index];
const int multiplier = nwc_layout ? channels : 1;
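// Scan the window left to right and credit the first element equal to the pooled
// max, matching the tie-breaking of the forward kernel; atomicAdd is needed because
// overlapping windows may route gradients to the same input element.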
for (int w = wstart; w < wend; ++w) {
if (in_data_slice[w * multiplier] == max_val) {
max_idx = w;
break;
}
}
// With pad > 0 and kernel = 1, for example, the window can be empty,
// so max_idx may still be -1 when reaching this step.
if (max_idx >= 0) {
atomicAdd(&in_grad[in_offset + max_idx * multiplier], out_grad[index]);
}
}
}
/*!
* \brief max unpooling gpu kernel for 2-D images, for both NCHW and NHWC layouts.
* Do not call this kernel directly. Use the interface unpool().
*/
template <typename DType, int layout>
__global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_grad,
const DType* in_data, const DType* out_data,
const int channels, const int height, const int width,
const int pooled_height, const int pooled_width,
const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w,
const int pad_h, const int pad_w,
DType* in_grad) {
// index is the output image's pixel index
// the order has to be consistent with pooling max
// to avoid adding out_grad to the wrong in_grad
// in the case where there are multiple max pixels
// covered by a kernel window
CUDA_KERNEL_LOOP(index, nthreads) {
const bool nhwc_layout = layout == mshadow::kNHWC;
const int idx = nhwc_layout ? (index / channels) : index;
const int pw = idx % pooled_width;
const int ph = (idx / pooled_width) % pooled_height;
const int c = nhwc_layout ? (index % channels)
: (index / pooled_width / pooled_height) % channels;
const int n = index / pooled_width / pooled_height / channels;
int hstart = ph * stride_h - pad_h;
int wstart = pw * stride_w - pad_w;
const int hend = min(hstart + kernel_h, height);
const int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
// in data/grad offset batch and channel dims
int in_offset = nhwc_layout ? n * channels * height * width + c
: (n * channels + c) * height * width;
const DType* in_data_slice = in_data + in_offset;
int max_idx = -1;
DType max_val = out_data[index];
const int multiplier = nhwc_layout ? channels : 1;
bool found = false;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int idx = h * width + w;
if (in_data_slice[idx * multiplier] == max_val) {
max_idx = idx;
found = true;
break;
}
}
if (found) break;
}
// With pad > 0 and kernel = 1, for example, the window can be empty,
// so max_idx may still be -1 when reaching this step.
if (max_idx >= 0) {
atomicAdd(&in_grad[in_offset + max_idx * multiplier], out_grad[index]);
}
}
}
/*!
* \brief max unpooling gpu kernel for 3-D images, for both NCDHW and NDHWC layouts.
* Do not call this kernel directly. Use the interface unpool().
*/
template <typename DType, int layout>
__global__ void unpool_max_3d_gpu_kernel(const int nthreads, const DType* out_grad,
const DType* in_data, const DType* out_data,
const int channels, const int depth, const int height,
const int width, const int pooled_depth,
const int pooled_height, const int pooled_width,
const int kernel_d, const int kernel_h,
const int kernel_w, const int stride_d,
const int stride_h, const int stride_w, const int pad_d,
const int pad_h, const int pad_w,
DType* in_grad) {
// index is the output image's pixel index
// the order has to be consistent with pooling max
// to avoid adding out_grad to the wrong in_grad
// in the case where there are multiple max pixels
// covered by a kernel window
CUDA_KERNEL_LOOP(index, nthreads) {
const bool ndhwc_layout = layout == mshadow::kNDHWC;
const int idx = ndhwc_layout ? (index / channels) : index;
const int pw = idx % pooled_width;
const int ph = (idx / pooled_width) % pooled_height;
const int pd = (idx / pooled_width / pooled_height) % pooled_depth;
const int c = ndhwc_layout ? (index % channels)
: (index / pooled_width / pooled_height / pooled_depth) % channels;
const int n = index / pooled_width / pooled_height / pooled_depth / channels;
int dstart = pd * stride_d - pad_d;
int hstart = ph * stride_h - pad_h;
int wstart = pw * stride_w - pad_w;
const int dend = min(dstart + kernel_d, depth);
const int hend = min(hstart + kernel_h, height);
const int wend = min(wstart + kernel_w, width);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
// in data/grad offset batch and channel dims
int in_offset = ndhwc_layout ? n * channels * depth * height * width + c
: (n * channels + c) * depth * height * width;
const DType* in_data_slice = in_data + in_offset;
int max_idx = -1;
DType max_val = out_data[index];
const int multiplier = ndhwc_layout ? channels : 1;
bool found = false;
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
const int idx = (d * height + h) * width + w;
if (in_data_slice[idx * multiplier] == max_val) {
max_idx = idx;
found = true;
break;
}
}
if (found) break;
}
if (found) break;
}
// With pad > 0 and kernel = 1, for example, the window can be empty,
// so max_idx may still be -1 when reaching this step.
if (max_idx >= 0) {
atomicAdd(&in_grad[in_offset + max_idx * multiplier], out_grad[index]);
}
}
}
/*!
* \brief avg/sum unpooling gpu kernel for 1-D images, for both NCW and NWC layouts.
* Do not call this kernel directly. Use the interface unpool().
*/
template<typename DType, int layout, int p = 1>
__global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_grad,
const DType* in_data, const DType* out_data,
const int channels, const int width,
const int pooled_width, const int kernel_w,
const int stride_w, const int pad_w, DType* in_grad,
const bool is_avg = false,
const bool count_include_pad = true) {
// index is the input image index
CUDA_KERNEL_LOOP(index, nthreads) {
// recover the input pixel position from the flat input index
const bool nwc_layout = layout == mshadow::kNWC;
const int idx = nwc_layout ? (index / channels) : index;
const int w = idx % width + pad_w;
const int c = nwc_layout ? index % channels : (index / width) % channels;
const int n = index / width / channels;
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int pwend = min(w / stride_w + 1, pooled_width);
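// [pwstart, pwend) is the range of output windows whose receptive field covers
// input position w; the gradient from each of them is accumulated below.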
DType gradient = 0;
const int slice_offset = nwc_layout ? n * channels * pooled_width + c
: (n * channels + c) * pooled_width;
const DType* out_grad_slice = out_grad + slice_offset;
const DType* out_data_slice = out_data + slice_offset;
const int multiplier = nwc_layout ? channels : 1;
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int wstart = pw * stride_w - pad_w;
int wend = min(wstart + kernel_w, width + pad_w);
int pool_size = (is_avg? (wend - wstart) : 1);
if (is_avg && !count_include_pad) {
wstart = max(wstart, 0);
wend = min(wend, width);
pool_size = (wend - wstart);
}
gradient +=
lp_grad<DType, p>::Map(out_grad_slice[pw * multiplier], in_data[index],
out_data_slice[pw * multiplier]) / pool_size;
}
// if req=kWriteTo, in_grad has already been zero-initialized in unpool()
// use "+=" instead of "=" to also accommodate req=kAddTo
in_grad[index] += gradient;
}
}
/*!
* \brief avg/sum unpooling gpu kernel for 2-D images, for both NCHW and NHWC layouts.
* Do not call this kernel directly. Use the interface unpool().
*/
template<typename DType, int layout, int p = 1>
__global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_grad,
const DType* in_data, const DType* out_data,
const int channels, const int height, const int width,
const int pooled_height, const int pooled_width,
const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w,
const int pad_h, const int pad_w, DType* in_grad,
const bool is_avg = false,
const bool count_include_pad = true) {
// index is the input image index
CUDA_KERNEL_LOOP(index, nthreads) {
// recover the input pixel coordinates from the flat input index
const bool nhwc_layout = layout == mshadow::kNHWC;
const int idx = nhwc_layout ? (index / channels) : index;
const int w = idx % width + pad_w;
const int h = (idx / width) % height + pad_h;
const int c = nhwc_layout ? index % channels : (index / width / height) % channels;
const int n = index / width / height / channels;
const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
const int phend = min(h / stride_h + 1, pooled_height);
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int pwend = min(w / stride_w + 1, pooled_width);
DType gradient = 0;
const int slice_offset = nhwc_layout ? n * channels * pooled_height * pooled_width + c
: (n * channels + c) * pooled_height * pooled_width;
const DType* out_grad_slice = out_grad + slice_offset;
const DType* out_data_slice = out_data + slice_offset;
const int multiplier = nhwc_layout ? channels : 1;
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int hstart = ph * stride_h - pad_h;
int wstart = pw * stride_w - pad_w;
int hend = min(hstart + kernel_h, height + pad_h);
int wend = min(wstart + kernel_w, width + pad_w);
int pool_size = (is_avg? (hend - hstart) * (wend - wstart) : 1);
int out_index = ph * pooled_width + pw;
if (is_avg && !count_include_pad) {
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, height);
wend = min(wend, width);
pool_size = (hend - hstart) * (wend - wstart);
}
gradient +=
lp_grad<DType, p>::Map(out_grad_slice[out_index * multiplier],
in_data[index],
out_data_slice[out_index * multiplier]) / pool_size;
}
}
// if req=kWriteTo, in_grad has already been zero-initialized in unpool()
// use "+=" instead of "=" to also accommodate req=kAddTo
in_grad[index] += gradient;
}
}
/*!
* \brief avg/sum unpooling gpu kernel for 3-D images, for both NCDHW and NDHWC layouts.
* Do not call this kernel directly. Use the interface unpool().
*/
template<typename DType, int layout, int p = 1>
__global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_grad,
const DType* in_data, const DType* out_data,
const int channels, const int depth, const int height,
const int width, const int pooled_depth,
const int pooled_height, const int pooled_width,
const int kernel_d, const int kernel_h,
const int kernel_w, const int stride_d, const int stride_h,
const int stride_w, const int pad_d, const int pad_h,
const int pad_w, DType* in_grad, const bool is_avg = false,
const bool count_include_pad = true) {
// index is the input image index
CUDA_KERNEL_LOOP(index, nthreads) {
// recover the input pixel coordinates from the flat input index
const bool ndhwc_layout = layout == mshadow::kNDHWC;
const int idx = ndhwc_layout ? (index / channels) : index;
const int w = idx % width + pad_w;
const int h = (idx / width) % height + pad_h;
const int d = (idx / width / height) % depth + pad_d;
const int c = ndhwc_layout ? index % channels : (index / width / height / depth) % channels;
const int n = index / width / height / depth / channels;
const int pdstart = (d < kernel_d) ? 0 : (d - kernel_d) / stride_d + 1;
const int pdend = min(d / stride_d + 1, pooled_depth);
const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
const int phend = min(h / stride_h + 1, pooled_height);
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int pwend = min(w / stride_w + 1, pooled_width);
DType gradient = 0;
const int slice_offset =
ndhwc_layout ? n * channels * pooled_depth * pooled_height * pooled_width + c
: (n * channels + c) * pooled_depth * pooled_height * pooled_width;
const DType* out_grad_slice = out_grad + slice_offset;
const DType* out_data_slice = out_data + slice_offset;
const int multiplier = ndhwc_layout ? channels : 1;
for (int pd = pdstart; pd < pdend; ++pd) {
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int dstart = pd * stride_d - pad_d;
int hstart = ph * stride_h - pad_h;
int wstart = pw * stride_w - pad_w;
int dend = min(dstart + kernel_d, depth + pad_d);
int hend = min(hstart + kernel_h, height + pad_h);
int wend = min(wstart + kernel_w, width + pad_w);
int pool_size = (is_avg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
int out_index = (pd * pooled_height + ph) * pooled_width + pw;
if (is_avg && !count_include_pad) {
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
dend = min(dend, depth);
hend = min(hend, height);
wend = min(wend, width);
pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
}
gradient += lp_grad<DType, p>::Map(out_grad_slice[out_index * multiplier],
in_data[index],
out_data_slice[out_index * multiplier]) / pool_size;
}
}
}
// if req=kWriteTo, in_grad has already been zero-initialized in unpool()
// use "+=" instead of "=" to also accommodate req=kAddTo
in_grad[index] += gradient;
}
}
/*!
* \brief This function serves as an interface for 1/2/3-D pooling operations.
* \param s context stream defining the device in use (gpu)
* \param in_data pointer to the input tensor data, laid out according to the layout template parameter
* \param ishape input tensor shape (in NCW/NCHW/NCDHW order)
* \param oshape output tensor shape (in NCW/NCHW/NCDHW order)
* \param kernel kernel shape
* \param pad pad shape
* \param stride stride shape
* \param pool_type supported pooling types: max, avg, sum, lp
* \param req_type operator request type; only kWriteTo is supported for now
* \param out_data pointer to the output tensor data, laid out according to the layout template parameter
* \param count_include_pad for avg pooling, whether to include the zero padding in the averaging window
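*
* A minimal invocation sketch (hypothetical shapes and device pointers, shown only to
* illustrate the template and argument order):
* \code
*   // max-pool a float tensor stored in NCW layout; in_ptr/out_ptr are device pointers
*   pool<float, mshadow::kNCW, 1>(s, in_ptr, ishape, oshape, kernel, pad, stride,
*                                 pool_enum::kMaxPooling, kWriteTo, out_ptr,
*                                 true);  // count_include_pad
* \endcode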
*/
template<typename DType, int layout, int p>
inline void pool(mshadow::Stream<gpu>* s, const DType* in_data, const mxnet::TShape& ishape,
const mxnet::TShape& oshape, const mxnet::TShape& kernel, const mxnet::TShape& pad,
const mxnet::TShape& stride, const int pool_type, OpReqType req_type,
DType* out_data, const bool count_include_pad) {
CHECK_EQ(req_type, kWriteTo) << "Only support req=kWriteTo in pooling operations";
using namespace mxnet_op;
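// Each launch below uses one GPU thread per output element (oshape.Size() threads in
// total), with mshadow::cuda::kBaseThreadNum threads per block.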
if (kernel.ndim() == 1) {
if (pool_enum::kMaxPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
pool_max_1d_gpu_kernel<DType, layout><<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
oshape.Size(), in_data, ishape[1], ishape[2],
oshape[2], kernel[0], stride[0], pad[0], out_data);
MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_1d_gpu_kernel);
} else if (pool_enum::kAvgPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
pool_sum_1d_gpu_kernel<DType, layout><<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
oshape.Size(), in_data, ishape[1], ishape[2], oshape[2],
kernel[0], stride[0], pad[0], out_data,
true, count_include_pad);
MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel);
} else if (pool_enum::kSumPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
pool_sum_1d_gpu_kernel<DType, layout><<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
oshape.Size(), in_data, ishape[1], ishape[2], oshape[2],
kernel[0], stride[0], pad[0], out_data);
MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel);
} else if (pool_enum::kLpPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
pool_sum_1d_gpu_kernel<DType, layout, p><<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
oshape.Size(), in_data, ishape[1], ishape[2], oshape[2],
kernel[0], stride[0], pad[0], out_data);
MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel);
} else {
LOG(FATAL) << "Unknown pooling type " << pool_type;
}
} else if (kernel.ndim() == 2) {
if (pool_enum::kMaxPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
pool_max_2d_gpu_kernel<DType, layout><<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
oshape[2], oshape[3], kernel[0], kernel[1],
stride[0], stride[1], pad[0], pad[1], out_data);
MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_2d_gpu_kernel);
} else if (pool_enum::kAvgPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
pool_sum_2d_gpu_kernel<DType, layout><<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
oshape[2], oshape[3], kernel[0], kernel[1],
stride[0], stride[1], pad[0], pad[1], out_data,
true, count_include_pad);
MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel);
} else if (pool_enum::kSumPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
pool_sum_2d_gpu_kernel<DType, layout><<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
oshape[2], oshape[3], kernel[0], kernel[1],
stride[0], stride[1], pad[0], pad[1], out_data);
MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel);
} else if (pool_enum::kLpPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
pool_sum_2d_gpu_kernel<DType, layout, p><<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
oshape[2], oshape[3], kernel[0], kernel[1],
stride[0], stride[1], pad[0], pad[1], out_data);
MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel);
} else {
LOG(FATAL) << "Unknown pooling type " << pool_type;
}
} else if (kernel.ndim() == 3) {
if (pool_enum::kMaxPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
pool_max_3d_gpu_kernel<DType, layout><<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
ishape[4], oshape[2], oshape[3], oshape[4],
kernel[0], kernel[1], kernel[2], stride[0],
stride[1], stride[2], pad[0], pad[1], pad[2], out_data);
MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_3d_gpu_kernel);
} else if (pool_enum::kAvgPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
pool_sum_3d_gpu_kernel<DType, layout><<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
ishape[4], oshape[2], oshape[3], oshape[4], kernel[0],
kernel[1], kernel[2], stride[0], stride[1], stride[2],
pad[0], pad[1], pad[2], out_data, true, count_include_pad);
MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel);
} else if (pool_enum::kSumPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
pool_sum_3d_gpu_kernel<DType, layout><<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
ishape[4], oshape[2], oshape[3], oshape[4], kernel[0],
kernel[1], kernel[2], stride[0], stride[1], stride[2],
pad[0], pad[1], pad[2], out_data);
MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel);
} else if (pool_enum::kLpPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
pool_sum_3d_gpu_kernel<DType, layout, p><<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
ishape[4], oshape[2], oshape[3], oshape[4], kernel[0],
kernel[1], kernel[2], stride[0], stride[1], stride[2],
pad[0], pad[1], pad[2], out_data);
MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel);
} else {
LOG(FATAL) << "Unknown pooling type " << pool_type;
}
}
}
/*!
* \brief This function serves as an interface for 1/2/3-D pooling operations.
* \param s context stream defining the device in use (gpu)
* \param in_data pointer to the input tensor data
* \param ishape input tensor shape
* \param oshape output tensor shape
* \param kernel kernel shape
* \param pad pad shape
* \param stride stride shape
* \param pool_type supported pooling types: max, avg, sum, lp
* \param req_type operator request type; only kWriteTo is supported for now
* \param out_data pointer to the output tensor data
* \param count_include_pad for avg pooling, whether to include the zero padding in the averaging window
* \param layout I/O tensor layout, e.g. NCHW vs. NHWC
*/
template<typename DType, int p>
inline void pool(mshadow::Stream<gpu>* s, const DType* in_data, const mxnet::TShape& ishape,
const mxnet::TShape& oshape, const mxnet::TShape& kernel, const mxnet::TShape& pad,
const mxnet::TShape& stride, const int pool_type, OpReqType req_type,
DType* out_data, const bool count_include_pad, int layout) {
if (kernel.ndim() == 1) {
if (layout == mshadow::kNWC) {
// standardize shapes to NCW to aid templated kernel invocation
mxnet::TShape ishape_ncw = ConvertLayout(ishape.get<3>(), mshadow::kNWC, mshadow::kNCW);
mxnet::TShape oshape_ncw = ConvertLayout(oshape.get<3>(), mshadow::kNWC, mshadow::kNCW);
pool<DType, mshadow::kNWC, p>(s, in_data, ishape_ncw, oshape_ncw, kernel,
pad, stride, pool_type, req_type, out_data, count_include_pad);
} else if (layout == mshadow::kNCW) {
pool<DType, mshadow::kNCW, p>(s, in_data, ishape, oshape, kernel,
pad, stride, pool_type, req_type, out_data, count_include_pad);
} else {
LOG(FATAL) << "Unsupported layout, expecting kNCW or kNWC, saw: " << layout;
}
} else if (kernel.ndim() == 2) {
if (layout == mshadow::kNHWC) {
// standardize shapes to NCHW to aid templated kernel invocation
mxnet::TShape ishape_nchw = ConvertLayout(ishape.get<4>(), mshadow::kNHWC, mshadow::kNCHW);
mxnet::TShape oshape_nchw = ConvertLayout(oshape.get<4>(), mshadow::kNHWC, mshadow::kNCHW);
pool<DType, mshadow::kNHWC, p>(s, in_data, ishape_nchw, oshape_nchw, kernel,
pad, stride, pool_type, req_type, out_data, count_include_pad);
} else if (layout == mshadow::kNCHW) {
pool<DType, mshadow::kNCHW, p>(s, in_data, ishape, oshape, kernel,
pad, stride, pool_type, req_type, out_data, count_include_pad);
} else {
LOG(FATAL) << "Unsupported layout, expecting kNCHW or kNHWC, saw: " << layout;
}
} else if (kernel.ndim() == 3) {
if (layout == mshadow::kNDHWC) {
// standardize shapes to NCDHW to aid templated kernel invocation
mxnet::TShape ishape_ncdhw = ConvertLayout(ishape.get<5>(), mshadow::kNDHWC, mshadow::kNCDHW);
mxnet::TShape oshape_ncdhw = ConvertLayout(oshape.get<5>(), mshadow::kNDHWC, mshadow::kNCDHW);
pool<DType, mshadow::kNDHWC, p>(s, in_data, ishape_ncdhw, oshape_ncdhw, kernel,
pad, stride, pool_type, req_type, out_data, count_include_pad);
} else if (layout == mshadow::kNCDHW) {
pool<DType, mshadow::kNCDHW, p>(s, in_data, ishape, oshape, kernel,
pad, stride, pool_type, req_type, out_data, count_include_pad);
} else {
LOG(FATAL) << "Unsupported layout, expecting kNCDHW or kNDHWC, saw: " << layout;
}
} else {
LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D pooling";
}
}
/*!
* \brief This function serves as an interface for 1/2/3-D unpooling operations.
* \param s context stream defining the device in use (gpu)
* \param out_grad pointer to the gradient of the operator's output tensor
* \param in_data pointer to the input tensor data, laid out according to the layout template parameter
* \param out_data pointer to the output tensor data, laid out according to the layout template parameter
* \param ishape input tensor shape (in NCW/NCHW/NCDHW order)
* \param oshape output tensor shape (in NCW/NCHW/NCDHW order)
* \param kernel kernel shape
* \param pad pad shape
* \param stride stride shape
* \param pool_type supported pooling types: max, avg, sum, lp
* \param req_type operator request type: kNullOp, kWriteInplace, kWriteTo, or kAddTo
* \param in_grad pointer to the gradient of the operator's input tensor
* \param count_include_pad for avg pooling, whether to include the zero padding in the averaging window
*/
template<typename DType, int layout, int p>
inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType* in_data,
const DType* out_data, const mxnet::TShape& ishape, const mxnet::TShape& oshape,
const mxnet::TShape& kernel, const mxnet::TShape& pad, const mxnet::TShape& stride,
const int pool_type, OpReqType req_type, DType* in_grad,
const bool count_include_pad) {
if (mxnet::kNullOp == req_type) return;
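// For kWriteTo and kWriteInplace the input gradient is zeroed first; the kernels
// accumulate into in_grad (+= / atomicAdd), so kAddTo needs no zeroing.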
if (mxnet::kAddTo != req_type) {
mxnet_op::Kernel<mxnet_op::set_zero, gpu>::Launch(s, ishape.Size(), in_grad);
}
using namespace mxnet_op;
if (kernel.ndim() == 1) {
if (pool_enum::kMaxPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
unpool_max_1d_gpu_kernel<DType, layout><<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
oshape.Size(), out_grad, in_data, out_data,
ishape[1], ishape[2], oshape[2], kernel[0], stride[0], pad[0],
in_grad);
MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_1d_gpu_kernel);
} else if (pool_enum::kAvgPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
unpool_sum_1d_gpu_kernel<DType, layout><<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
ishape.Size(), out_grad, in_data, out_data,
ishape[1], ishape[2], oshape[2], kernel[0],
stride[0], pad[0], in_grad, true, count_include_pad);
MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_1d_gpu_kernel);
} else if (pool_enum::kSumPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
unpool_sum_1d_gpu_kernel<DType, layout><<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
ishape.Size(), out_grad, in_data, out_data,
ishape[1], ishape[2], oshape[2], kernel[0],
stride[0], pad[0], in_grad);
MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_1d_gpu_kernel);
} else if (pool_enum::kLpPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
unpool_sum_1d_gpu_kernel<DType, layout, p><<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
ishape.Size(), out_grad, in_data, out_data,
ishape[1], ishape[2], oshape[2], kernel[0],
stride[0], pad[0], in_grad);
MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_1d_gpu_kernel);
} else {
LOG(FATAL) << "Unknown pooling type " << pool_type;
}
} else if (kernel.ndim() == 2) {
if (pool_enum::kMaxPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
unpool_max_2d_gpu_kernel<DType, layout><<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
oshape.Size(), out_grad, in_data, out_data,
ishape[1], ishape[2], ishape[3],
oshape[2], oshape[3], kernel[0], kernel[1],
stride[0], stride[1], pad[0], pad[1], in_grad);
MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_2d_gpu_kernel);
} else if (pool_enum::kAvgPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
unpool_sum_2d_gpu_kernel<DType, layout><<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
ishape.Size(), out_grad, in_data, out_data,
ishape[1], ishape[2], ishape[3],
oshape[2], oshape[3], kernel[0], kernel[1],
stride[0], stride[1], pad[0], pad[1], in_grad,
true, count_include_pad);
MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel);
} else if (pool_enum::kSumPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
unpool_sum_2d_gpu_kernel<DType, layout><<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
ishape.Size(), out_grad, in_data, out_data,
ishape[1], ishape[2], ishape[3],
oshape[2], oshape[3], kernel[0], kernel[1],
stride[0], stride[1], pad[0], pad[1], in_grad);
MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel);
} else if (pool_enum::kLpPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
unpool_sum_2d_gpu_kernel<DType, layout, p><<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
ishape.Size(), out_grad, in_data, out_data,
ishape[1], ishape[2], ishape[3],
oshape[2], oshape[3], kernel[0], kernel[1],
stride[0], stride[1], pad[0], pad[1], in_grad);
MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel);
} else {
LOG(FATAL) << "Unknown pooling type " << pool_type;
}
} else if (kernel.ndim() == 3) {
if (pool_enum::kMaxPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
unpool_max_3d_gpu_kernel<DType, layout><<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
oshape.Size(), out_grad, in_data, out_data,
ishape[1], ishape[2], ishape[3], ishape[4],
oshape[2], oshape[3], oshape[4], kernel[0], kernel[1],
kernel[2], stride[0], stride[1], stride[2],
pad[0], pad[1], pad[2], in_grad);
MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_3d_gpu_kernel);
} else if (pool_enum::kAvgPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
unpool_sum_3d_gpu_kernel<DType, layout><<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
ishape.Size(), out_grad, in_data, out_data,
ishape[1], ishape[2], ishape[3], ishape[4],
oshape[2], oshape[3], oshape[4], kernel[0], kernel[1],
kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1],
pad[2], in_grad, true, count_include_pad);
MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_3d_gpu_kernel);
} else if (pool_enum::kSumPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
unpool_sum_3d_gpu_kernel<DType, layout><<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
ishape.Size(), out_grad, in_data, out_data,
ishape[1], ishape[2], ishape[3], ishape[4],
oshape[2], oshape[3], oshape[4], kernel[0], kernel[1],
kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1],
pad[2], in_grad);
MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_3d_gpu_kernel);
} else if (pool_enum::kLpPooling == pool_type) {
// NOLINT_NEXT_LINE(whitespace/operators)
unpool_sum_3d_gpu_kernel<DType, layout, p><<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
0, mshadow::Stream<gpu>::GetStream(s)>>>(
ishape.Size(), out_grad, in_data, out_data,
ishape[1], ishape[2], ishape[3], ishape[4],
oshape[2], oshape[3], oshape[4], kernel[0], kernel[1],
kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1],
pad[2], in_grad);
MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_3d_gpu_kernel);
} else {
LOG(FATAL) << "Unknown pooling type " << pool_type;
}
} else {
LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D unpooling";
}
}
/*!
* \brief This function serves as an interface for 1/2/3-D unpooling operations.
* \param s context stream defining the device in use (gpu)
* \param out_grad pointer to the gradient of the operator's output tensor
* \param in_data pointer to the input tensor data
* \param out_data pointer to the output tensor data
* \param ishape input tensor shape
* \param oshape output tensor shape
* \param kernel kernel shape
* \param pad pad shape
* \param stride stride shape
* \param pool_type supported pooling types: max, avg, sum, lp
* \param req_type operator request type: kNullOp, kWriteInplace, kWriteTo, or kAddTo
* \param in_grad pointer to the gradient of the operator's input tensor
* \param count_include_pad for avg pooling, whether to include the zero padding in the averaging window
* \param layout I/O tensor layout, e.g. NCHW vs. NHWC
*/
template<typename DType, int p>
inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType* in_data,
const DType* out_data, const mxnet::TShape& ishape, const mxnet::TShape& oshape,
const mxnet::TShape& kernel, const mxnet::TShape& pad, const mxnet::TShape& stride,
const int pool_type, OpReqType req_type, DType* in_grad,
const bool count_include_pad, int layout) {
if (kernel.ndim() == 1) {
if (layout == mshadow::kNWC) {
// standardize shapes to NCW to aid templated kernel invocation
mxnet::TShape ishape_ncw = ConvertLayout(ishape.get<3>(), mshadow::kNWC, mshadow::kNCW);
mxnet::TShape oshape_ncw = ConvertLayout(oshape.get<3>(), mshadow::kNWC, mshadow::kNCW);
unpool<DType, mshadow::kNWC, p>(s, out_grad, in_data, out_data, ishape_ncw, oshape_ncw,
kernel, pad, stride, pool_type, req_type, in_grad, count_include_pad);
} else if (layout == mshadow::kNCW) {
unpool<DType, mshadow::kNCW, p>(s, out_grad, in_data, out_data, ishape, oshape, kernel,
pad, stride, pool_type, req_type, in_grad, count_include_pad);
} else {
LOG(FATAL) << "Unsupported layout, expecting kNCW or kNWC, saw: " << layout;
}
} else if (kernel.ndim() == 2) {
if (layout == mshadow::kNHWC) {
// standardize shapes to NCHW to aid templated kernel invocation
mxnet::TShape ishape_nchw = ConvertLayout(ishape.get<4>(), mshadow::kNHWC, mshadow::kNCHW);
mxnet::TShape oshape_nchw = ConvertLayout(oshape.get<4>(), mshadow::kNHWC, mshadow::kNCHW);
unpool<DType, mshadow::kNHWC, p>(s, out_grad, in_data, out_data, ishape_nchw, oshape_nchw,
kernel, pad, stride, pool_type, req_type, in_grad, count_include_pad);
} else if (layout == mshadow::kNCHW) {
unpool<DType, mshadow::kNCHW, p>(s, out_grad, in_data, out_data, ishape, oshape, kernel,
pad, stride, pool_type, req_type, in_grad, count_include_pad);
} else {
LOG(FATAL) << "Unsupported layout, expecting kNCHW or kNHWC, saw: " << layout;
}
} else if (kernel.ndim() == 3) {
if (layout == mshadow::kNDHWC) {
// standardize shapes to NCDHW to aid templated kernel invocation
mxnet::TShape ishape_ncdhw = ConvertLayout(ishape.get<5>(), mshadow::kNDHWC, mshadow::kNCDHW);
mxnet::TShape oshape_ncdhw = ConvertLayout(oshape.get<5>(), mshadow::kNDHWC, mshadow::kNCDHW);
unpool<DType, mshadow::kNDHWC, p>(s, out_grad, in_data, out_data, ishape_ncdhw, oshape_ncdhw,
kernel, pad, stride, pool_type, req_type, in_grad, count_include_pad);
} else if (layout == mshadow::kNCDHW) {
unpool<DType, mshadow::kNCDHW, p>(s, out_grad, in_data, out_data, ishape, oshape, kernel,
pad, stride, pool_type, req_type, in_grad, count_include_pad);
} else {
LOG(FATAL) << "Unsupported layout, expecting kNCDHW or kNDHWC, saw: " << layout;
}
} else {
LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D unpooling";
}
}
} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_NN_POOL_CUH_