/*!
 * Copyright (c) 2015 by Contributors
 * \file pad.cc
 * \brief CPU implementation of the pad operator (constant, edge and reflection padding)
 * \author Sebastian Bodenstein
 */
#include "./pad-inl.h"
namespace mshadow {
////////////////////////////////////////////////////////////////////////////////
// Special case: 2-d image (pad height and width only)
// Case 1: Edge Padding (or Replication Padding)
// single_image_edge (2-d case) adapted from Torch:
// https://github.com/torch/nn/blob/master/lib/THNN/generic/SpatialReplicationPadding.c
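//
// For intuition, an illustrative example (not part of the original source):
// with iwidth = 3, pad_l = 2 and pad_r = 1, the horizontal source index
// computed below works out to
//   j    : 0  1  2  3  4  5
//   ip_x : 0  0  0  1  2  2   (after the "- oStartX + iStartX" shift)
// i.e. every border column replicates the nearest edge column.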
template <typename DType>
void single_image_edge(const Tensor<cpu, 3, DType> dst,
                       const Tensor<cpu, 3, DType> src, mxnet::TShape pad) {
  const int nslices = src.size(0);
  const int iheight = src.size(1);
  const int iwidth = src.size(2);
  const int oheight = dst.size(1);
  const int owidth = dst.size(2);
  const int pad_t = pad[4];
  const int pad_l = pad[6];
  int iStartX = std::max(0, -pad_l);
  int iStartY = std::max(0, -pad_t);
  int oStartX = std::max(0, pad_l);
  int oStartY = std::max(0, pad_t);
  int k, ip_x, ip_y;
#pragma omp parallel for private(k, ip_x, ip_y)
  for (k = 0; k < nslices; k++) {
    int i, j;
    for (i = 0; i < oheight; i++) {
      for (j = 0; j < owidth; j++) {
        // Clamp the output column to the valid input range: columns in the
        // padded border replicate the nearest edge column.
        if (j < pad_l) {
          ip_x = pad_l;
        } else if (j >= pad_l && j < iwidth + pad_l) {
          ip_x = j;
        } else {
          ip_x = iwidth + pad_l - 1;
        }
        ip_x = ip_x - oStartX + iStartX;
        // Same clamping for the output row.
        if (i < pad_t) {
          ip_y = pad_t;
        } else if (i >= pad_t && i < iheight + pad_t) {
          ip_y = i;
        } else {
          ip_y = iheight + pad_t - 1;
        }
        ip_y = ip_y - oStartY + iStartY;
        DType *dest_p = dst.dptr_ + k * owidth * oheight + i * owidth + j;
        DType *src_p = src.dptr_ + k * iwidth * iheight + ip_y * iwidth + ip_x;
        *dest_p = *src_p;
      }
    }
  }
}
template <typename DType>
void single_image_edge_grad(const Tensor<cpu, 3, DType> &grad_in,
                            const Tensor<cpu, 3, DType> grad_out,
                            mxnet::TShape pad) {
  const int nslices = grad_in.size(0);
  const int iheight = grad_in.size(1);
  const int iwidth = grad_in.size(2);
  const int oheight = grad_out.size(1);
  const int owidth = grad_out.size(2);
  const int pad_t = pad[4];
  const int pad_l = pad[6];
  int iStartX = std::max(0, -pad_l);
  int iStartY = std::max(0, -pad_t);
  int oStartX = std::max(0, pad_l);
  int oStartY = std::max(0, pad_t);
  int k, ip_x, ip_y;
#pragma omp parallel for private(k, ip_x, ip_y)
  for (k = 0; k < nslices; k++) {
    int i, j;
    for (i = 0; i < oheight; i++) {
      for (j = 0; j < owidth; j++) {
        if (j < pad_l) {
          ip_x = pad_l;
        } else if (j >= pad_l && j < iwidth + pad_l) {
          ip_x = j;
        } else {
          ip_x = iwidth + pad_l - 1;
        }
        ip_x = ip_x - oStartX + iStartX;
        if (i < pad_t) {
          ip_y = pad_t;
        } else if (i >= pad_t && i < iheight + pad_t) {
          ip_y = i;
        } else {
          ip_y = iheight + pad_t - 1;
        }
        ip_y = ip_y - oStartY + iStartY;
        DType *src_p = grad_out.dptr_ + k * owidth * oheight + i * owidth + j;
        DType *dest_p =
            grad_in.dptr_ + k * iwidth * iheight + ip_y * iwidth + ip_x;
        // Accumulate: several padded output pixels map back to the same input
        // pixel, so gradients must be summed rather than overwritten.
        *dest_p += *src_p;
      }
    }
  }
}
// Case 2: Zero Padding
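// For illustration (not part of the original source): padding a single
// channel [[1, 2], [3, 4]] with pad_t = pad_l = 1, pad_b = pad_r = 0 and
// constant_value = 0 yields
//   [[0, 0, 0],
//    [0, 1, 2],
//    [0, 3, 4]]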
template <typename DType>
void single_image_constant(const Tensor<cpu, 3, DType> &dst,
                           const Tensor<cpu, 3, DType> src, mxnet::TShape pad,
                           DType constant_value) {
  const int pad_t = pad[4];
  const int pad_l = pad[6];
  int c, w, h;
#pragma omp parallel for private(c, w, h)
  for (c = 0; c < dst.size(0); ++c) {
    for (h = 0; h < dst.size(1); ++h) {
      for (w = 0; w < dst.size(2); ++w) {
        // Border pixels get the constant; interior pixels copy the source.
        if ((w < pad_l) || (h < pad_t) || (h >= (src.size(1) + pad_t)) ||
            (w >= (src.size(2) + pad_l))) {
          dst[c][h][w] = constant_value;
        } else {
          dst[c][h][w] = src[c][h - pad_t][w - pad_l];
        }
      }
    }
  }
}
template <typename DType>
void single_image_constant_grad(const Tensor<cpu, 3, DType> &in_grad,
                                const Tensor<cpu, 3, DType> out_grad,
                                mxnet::TShape pad) {
  const int pad_t = pad[4];
  const int pad_l = pad[6];
  int c, h, w;
#pragma omp parallel for private(c, w, h)
  for (c = 0; c < in_grad.size(0); ++c) {
    for (h = 0; h < in_grad.size(1); ++h) {
      for (w = 0; w < in_grad.size(2); ++w) {
        // The constant border contributes no gradient; only the interior
        // region of out_grad flows back to in_grad.
        in_grad[c][h][w] += out_grad[c][h + pad_t][w + pad_l];
      }
    }
  }
}
// Case 3: Reflection Padding
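// For intuition, an illustrative example (not part of the original source):
// reflection padding mirrors the image about its border without repeating
// the edge pixel. With a row [a, b, c] (iwidth = 3) and pad_l = pad_r = 2,
// the padded row is [c, b, a, b, c, b, a].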
template <typename DType>
void single_image_reflect(const Tensor<cpu, 3, DType> &dst,
                          const Tensor<cpu, 3, DType> src, mxnet::TShape pad) {
  const int nslices = src.size(0);
  const int iheight = src.size(1);
  const int iwidth = src.size(2);
  const int oheight = dst.size(1);
  const int owidth = dst.size(2);
  const int pad_t = pad[4];
  const int pad_l = pad[6];
  int iStartX = std::max(0, -pad_l);
  int iStartY = std::max(0, -pad_t);
  int oStartX = std::max(0, pad_l);
  int oStartY = std::max(0, pad_t);
  int k, ip_x, ip_y;
#pragma omp parallel for private(k, ip_x, ip_y)
  for (k = 0; k < nslices; k++) {
    int i, j;
    for (i = 0; i < oheight; i++) {
      for (j = 0; j < owidth; j++) {
        // Mirror the output column about the left or right border
        // (2 * border - j), excluding the border pixel itself.
        if (j < pad_l) {
          ip_x = pad_l * 2 - j;
        } else if (j >= pad_l && j < iwidth + pad_l) {
          ip_x = j;
        } else {
          ip_x = (iwidth + pad_l - 1) * 2 - j;
        }
        ip_x = ip_x - oStartX + iStartX;
        // Same mirroring for the output row.
        if (i < pad_t) {
          ip_y = pad_t * 2 - i;
        } else if (i >= pad_t && i < iheight + pad_t) {
          ip_y = i;
        } else {
          ip_y = (iheight + pad_t - 1) * 2 - i;
        }
        ip_y = ip_y - oStartY + iStartY;
        DType *dest_p = dst.dptr_ + k * owidth * oheight + i * owidth + j;
        DType *src_p = src.dptr_ + k * iwidth * iheight + ip_y * iwidth + ip_x;
        *dest_p = *src_p;
      }
    }
  }
}
template <typename DType>
void single_image_reflect_grad(const Tensor<cpu, 3, DType> &grad_in,
                               const Tensor<cpu, 3, DType> grad_out,
                               mxnet::TShape pad) {
  const int nslices = grad_in.size(0);
  const int iheight = grad_in.size(1);
  const int iwidth = grad_in.size(2);
  const int oheight = grad_out.size(1);
  const int owidth = grad_out.size(2);
  const int pad_t = pad[4];
  const int pad_l = pad[6];
  int iStartX = std::max(0, -pad_l);
  int iStartY = std::max(0, -pad_t);
  int oStartX = std::max(0, pad_l);
  int oStartY = std::max(0, pad_t);
  int k, ip_x, ip_y;
#pragma omp parallel for private(k, ip_x, ip_y)
  for (k = 0; k < nslices; k++) {
    int i, j;
    for (i = 0; i < oheight; i++) {
      for (j = 0; j < owidth; j++) {
        if (j < pad_l) {
          ip_x = pad_l * 2 - j;
        } else if (j >= pad_l && j < iwidth + pad_l) {
          ip_x = j;
        } else {
          ip_x = (iwidth + pad_l - 1) * 2 - j;
        }
        ip_x = ip_x - oStartX + iStartX;
        if (i < pad_t) {
          ip_y = pad_t * 2 - i;
        } else if (i >= pad_t && i < iheight + pad_t) {
          ip_y = i;
        } else {
          ip_y = (iheight + pad_t - 1) * 2 - i;
        }
        ip_y = ip_y - oStartY + iStartY;
        DType *src_p = grad_out.dptr_ + k * owidth * oheight + i * owidth + j;
        DType *dest_p =
            grad_in.dptr_ + k * iwidth * iheight + ip_y * iwidth + ip_x;
        *dest_p += *src_p;
      }
    }
  }
}
////////////////////////////////////////////////////////////////////////////////
// Special case: 3-d image (pad depth, height and width only)
// Case 1: Edge Padding (or Replication Padding)
// single_image_edge (3-d case) adapted from Torch:
// https://github.com/torch/nn/blob/master/lib/THNN/generic/VolumetricReplicationPadding.c
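//
// Note on indexing (illustrative, not part of the original source): the 4-d
// tensors below are addressed through their flat buffers, so element
// (k, z, i, j) of dst lives at offset
//   k * owidth * oheight * odepth + z * owidth * oheight + i * owidth + j
// which is exactly what the dest_p / src_p pointer arithmetic computes.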
template <typename DType>
void single_image_edge(const Tensor<cpu, 4, DType> dst,
                       const Tensor<cpu, 4, DType> src, mxnet::TShape pad) {
  const int nslices = src.size(0);
  const int idepth = src.size(1);
  const int iheight = src.size(2);
  const int iwidth = src.size(3);
  const int odepth = dst.size(1);
  const int oheight = dst.size(2);
  const int owidth = dst.size(3);
  const int pad_f = pad[4];
  const int pad_t = pad[6];
  const int pad_l = pad[8];
  int iStartX = std::max(0, -pad_l);
  int iStartY = std::max(0, -pad_t);
  int iStartZ = std::max(0, -pad_f);
  int oStartX = std::max(0, pad_l);
  int oStartY = std::max(0, pad_t);
  int oStartZ = std::max(0, pad_f);
  int k, ip_x, ip_y, ip_z;
#pragma omp parallel for private(k, ip_x, ip_y, ip_z)
  for (k = 0; k < nslices; k++) {
    int i, j, z;
    for (z = 0; z < odepth; z++) {
      for (i = 0; i < oheight; i++) {
        for (j = 0; j < owidth; j++) {
          if (j < pad_l) {
            ip_x = pad_l;
          } else if (j >= pad_l && j < iwidth + pad_l) {
            ip_x = j;
          } else {
            ip_x = iwidth + pad_l - 1;
          }
          ip_x = ip_x - oStartX + iStartX;
          if (i < pad_t) {
            ip_y = pad_t;
          } else if (i >= pad_t && i < iheight + pad_t) {
            ip_y = i;
          } else {
            ip_y = iheight + pad_t - 1;
          }
          ip_y = ip_y - oStartY + iStartY;
          if (z < pad_f) {
            ip_z = pad_f;
          } else if (z >= pad_f && z < idepth + pad_f) {
            ip_z = z;
          } else {
            ip_z = idepth + pad_f - 1;
          }
          ip_z = ip_z - oStartZ + iStartZ;
          DType *dest_p = dst.dptr_ + k * owidth * oheight * odepth +
                          z * owidth * oheight + i * owidth + j;
          DType *src_p = src.dptr_ + k * iwidth * iheight * idepth +
                         ip_z * iwidth * iheight + ip_y * iwidth + ip_x;
          *dest_p = *src_p;
        }
      }
    }
  }
}
template <typename DType>
void single_image_edge_grad(const Tensor<cpu, 4, DType> &grad_in,
                            const Tensor<cpu, 4, DType> grad_out,
                            mxnet::TShape pad) {
  const int nslices = grad_in.size(0);
  const int idepth = grad_in.size(1);
  const int iheight = grad_in.size(2);
  const int iwidth = grad_in.size(3);
  const int odepth = grad_out.size(1);
  const int oheight = grad_out.size(2);
  const int owidth = grad_out.size(3);
  const int pad_f = pad[4];
  const int pad_t = pad[6];
  const int pad_l = pad[8];
  int iStartX = std::max(0, -pad_l);
  int iStartY = std::max(0, -pad_t);
  int iStartZ = std::max(0, -pad_f);
  int oStartX = std::max(0, pad_l);
  int oStartY = std::max(0, pad_t);
  int oStartZ = std::max(0, pad_f);
  int k, ip_x, ip_y, ip_z;
#pragma omp parallel for private(k, ip_x, ip_y, ip_z)
  for (k = 0; k < nslices; k++) {
    int i, j, z;
    for (z = 0; z < odepth; z++) {
      for (i = 0; i < oheight; i++) {
        for (j = 0; j < owidth; j++) {
          if (j < pad_l) {
            ip_x = pad_l;
          } else if (j >= pad_l && j < iwidth + pad_l) {
            ip_x = j;
          } else {
            ip_x = iwidth + pad_l - 1;
          }
          ip_x = ip_x - oStartX + iStartX;
          if (i < pad_t) {
            ip_y = pad_t;
          } else if (i >= pad_t && i < iheight + pad_t) {
            ip_y = i;
          } else {
            ip_y = iheight + pad_t - 1;
          }
          ip_y = ip_y - oStartY + iStartY;
          if (z < pad_f) {
            ip_z = pad_f;
          } else if (z >= pad_f && z < idepth + pad_f) {
            ip_z = z;
          } else {
            ip_z = idepth + pad_f - 1;
          }
          ip_z = ip_z - oStartZ + iStartZ;
          DType *src_p = grad_out.dptr_ + k * owidth * oheight * odepth +
                         z * owidth * oheight + i * owidth + j;
          DType *dest_p = grad_in.dptr_ + k * iwidth * iheight * idepth +
                          ip_z * iwidth * iheight + ip_y * iwidth + ip_x;
          *dest_p += *src_p;
        }
      }
    }
  }
}
// Case 2: Zero Padding
template <typename DType>
void single_image_constant(const Tensor<cpu, 4, DType> &dst,
                           const Tensor<cpu, 4, DType> src, mxnet::TShape pad,
                           DType constant_value) {
  const int pad_f = pad[4];
  const int pad_t = pad[6];
  const int pad_l = pad[8];
  int c, d, w, h;
#pragma omp parallel for private(c, d, w, h)
  for (c = 0; c < dst.size(0); ++c) {
    for (d = 0; d < dst.size(1); ++d) {
      for (h = 0; h < dst.size(2); ++h) {
        for (w = 0; w < dst.size(3); ++w) {
          if ((w < pad_l) || (h < pad_t) || (d < pad_f) ||
              (d >= (src.size(1) + pad_f)) || (h >= (src.size(2) + pad_t)) ||
              (w >= (src.size(3) + pad_l))) {
            dst[c][d][h][w] = constant_value;
          } else {
            dst[c][d][h][w] = src[c][d - pad_f][h - pad_t][w - pad_l];
          }
        }
      }
    }
  }
}
template <typename DType>
void single_image_constant_grad(const Tensor<cpu, 4, DType> &in_grad,
                                const Tensor<cpu, 4, DType> out_grad,
                                mxnet::TShape pad) {
  const int pad_f = pad[4];
  const int pad_t = pad[6];
  const int pad_l = pad[8];
  int c, d, w, h;
#pragma omp parallel for private(c, d, w, h)
  for (c = 0; c < in_grad.size(0); ++c) {
    for (d = 0; d < in_grad.size(1); ++d) {
      for (h = 0; h < in_grad.size(2); ++h) {
        for (w = 0; w < in_grad.size(3); ++w) {
          in_grad[c][d][h][w] += out_grad[c][d + pad_f][h + pad_t][w + pad_l];
        }
      }
    }
  }
}
// Case 3: Reflection Padding
template <typename DType>
void single_image_reflect(const Tensor<cpu, 4, DType> &dst,
                          const Tensor<cpu, 4, DType> src, mxnet::TShape pad) {
  const int nslices = src.size(0);
  const int idepth = src.size(1);
  const int iheight = src.size(2);
  const int iwidth = src.size(3);
  const int odepth = dst.size(1);
  const int oheight = dst.size(2);
  const int owidth = dst.size(3);
  const int pad_f = pad[4];
  const int pad_t = pad[6];
  const int pad_l = pad[8];
  int iStartX = std::max(0, -pad_l);
  int iStartY = std::max(0, -pad_t);
  int iStartZ = std::max(0, -pad_f);
  int oStartX = std::max(0, pad_l);
  int oStartY = std::max(0, pad_t);
  int oStartZ = std::max(0, pad_f);
  int l, ip_x, ip_y, ip_z;
#pragma omp parallel for private(l, ip_x, ip_y, ip_z)
  for (l = 0; l < nslices; l++) {
    int i, j, k;
    for (k = 0; k < odepth; k++) {
      for (i = 0; i < oheight; i++) {
        for (j = 0; j < owidth; j++) {
          if (j < pad_l) {
            ip_x = pad_l * 2 - j;
          } else if (j >= pad_l && j < iwidth + pad_l) {
            ip_x = j;
          } else {
            ip_x = (iwidth + pad_l - 1) * 2 - j;
          }
          ip_x = ip_x - oStartX + iStartX;
          if (i < pad_t) {
            ip_y = pad_t * 2 - i;
          } else if (i >= pad_t && i < iheight + pad_t) {
            ip_y = i;
          } else {
            ip_y = (iheight + pad_t - 1) * 2 - i;
          }
          ip_y = ip_y - oStartY + iStartY;
          if (k < pad_f) {
            ip_z = pad_f * 2 - k;
          } else if (k >= pad_f && k < idepth + pad_f) {
            ip_z = k;
          } else {
            ip_z = (idepth + pad_f - 1) * 2 - k;
          }
          ip_z = ip_z - oStartZ + iStartZ;
          DType *dest_p = dst.dptr_ + l * owidth * oheight * odepth +
                          k * owidth * oheight + i * owidth + j;
          DType *src_p = src.dptr_ + l * iwidth * iheight * idepth +
                         ip_z * iwidth * iheight + ip_y * iwidth + ip_x;
          *dest_p = *src_p;
        }
      }
    }
  }
}
template <typename DType>
void single_image_reflect_grad(const Tensor<cpu, 4, DType> &grad_in,
                               const Tensor<cpu, 4, DType> grad_out,
                               mxnet::TShape pad) {
  const int nslices = grad_in.size(0);
  const int idepth = grad_in.size(1);
  const int iheight = grad_in.size(2);
  const int iwidth = grad_in.size(3);
  const int odepth = grad_out.size(1);
  const int oheight = grad_out.size(2);
  const int owidth = grad_out.size(3);
  const int pad_f = pad[4];
  const int pad_t = pad[6];
  const int pad_l = pad[8];
  int iStartX = std::max(0, -pad_l);
  int iStartY = std::max(0, -pad_t);
  int iStartZ = std::max(0, -pad_f);
  int oStartX = std::max(0, pad_l);
  int oStartY = std::max(0, pad_t);
  int oStartZ = std::max(0, pad_f);
  int l, ip_x, ip_y, ip_z;
  /*#pragma omp parallel for private(l, ip_x, ip_y, ip_z)*/
  for (l = 0; l < nslices; l++) {
    int i, j, k;
    for (k = 0; k < odepth; k++) {
      for (i = 0; i < oheight; i++) {
        for (j = 0; j < owidth; j++) {
          if (j < pad_l) {
            ip_x = pad_l * 2 - j;
          } else if (j >= pad_l && j < iwidth + pad_l) {
            ip_x = j;
          } else {
            ip_x = (iwidth + pad_l - 1) * 2 - j;
          }
          ip_x = ip_x - oStartX + iStartX;
          if (i < pad_t) {
            ip_y = pad_t * 2 - i;
          } else if (i >= pad_t && i < iheight + pad_t) {
            ip_y = i;
          } else {
            ip_y = (iheight + pad_t - 1) * 2 - i;
          }
          ip_y = ip_y - oStartY + iStartY;
          if (k < pad_f) {
            ip_z = pad_f * 2 - k;
          } else if (k >= pad_f && k < idepth + pad_f) {
            ip_z = k;
          } else {
            ip_z = (idepth + pad_f - 1) * 2 - k;
          }
          ip_z = ip_z - oStartZ + iStartZ;
          DType *src_p = grad_out.dptr_ + l * owidth * oheight * odepth +
                         k * owidth * oheight + i * owidth + j;
          DType *dest_p = grad_in.dptr_ + l * iwidth * iheight * idepth +
                          ip_z * iwidth * iheight + ip_y * iwidth + ip_x;
          *dest_p += *src_p;
        }
      }
    }
  }
}
////////////////////////////////////////////////////////////////////////////////
// Interface to 2d and 3d image pad methods
template <int dim, typename DType>
void pad_image(const Tensor<cpu, dim, DType> &dst,
               const Tensor<cpu, dim, DType> src, mxnet::TShape pad, int mode,
               DType constant_value) {
  for (index_t n = 0; n < dst.size(0); ++n) {
    switch (mode) {
      case mxnet::op::pad_enum::kEdge:
        single_image_edge(dst[n], src[n], pad);
        break;
      case mxnet::op::pad_enum::kConstant:
        single_image_constant(dst[n], src[n], pad, constant_value);
        break;
      case mxnet::op::pad_enum::kReflect:
        single_image_reflect(dst[n], src[n], pad);
        break;
    }
  }
}
template <int dim, typename DType>
void pad_image_grad(const Tensor<cpu, dim, DType> &in_grad,
                    const Tensor<cpu, dim, DType> out_grad, mxnet::TShape pad,
                    int mode) {
  for (index_t n = 0; n < in_grad.size(0); ++n) {
    switch (mode) {
      case mxnet::op::pad_enum::kEdge:
        single_image_edge_grad(in_grad[n], out_grad[n], pad);
        break;
      case mxnet::op::pad_enum::kConstant:
        single_image_constant_grad(in_grad[n], out_grad[n], pad);
        break;
      case mxnet::op::pad_enum::kReflect:
        single_image_reflect_grad(in_grad[n], out_grad[n], pad);
        break;
    }
  }
}
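// Usage sketch (illustrative only; not part of the original source): given
// mshadow tensors laid out as (batch, channel, height, width), the
// forward/backward entry points above are invoked per mini-batch, e.g.
//
//   Tensor<cpu, 4, float> src = ...;  // batch x channel x H x W
//   Tensor<cpu, 4, float> dst = ...;  // batch x channel x (H + pad_t + pad_b)
//                                     //                 x (W + pad_l + pad_r)
//   pad_image(dst, src, param.pad_width, mxnet::op::pad_enum::kEdge,
//             static_cast<float>(param.constant_value));
//
// pad_image_grad accumulates into in_grad (note the += in the kernels), so
// the caller is expected to zero-initialize in_grad before invoking it.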
} // namespace mshadow
namespace mxnet {
namespace op {
template <>
Operator *CreateOp<cpu>(PadParam param, int dtype) {
  Operator *op = NULL;
  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { op = new PadOp<cpu, DType>(param); })
  return op;
}
// DO_BIND_DISPATCH comes from operator_common.h
Operator *PadProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
                                    std::vector<int> *in_type) const {
  std::vector<TShape> out_shape, aux_shape;
  std::vector<int> out_type, aux_type;
  CHECK(InferType(in_type, &out_type, &aux_type));
  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
}
DMLC_REGISTER_PARAMETER(PadParam);
MXNET_REGISTER_OP_PROPERTY(Pad, PadProp)
.describe(R"code(Pads an input array with a constant or edge values of the array.
.. note:: `Pad` is deprecated. Use `pad` instead.
.. note:: Current implementation only supports 4D and 5D input arrays with padding applied
only on axes 1, 2 and 3. Expects axes 4 and 5 in `pad_width` to be zero.
This operation pads an input array with either a `constant_value` or edge values
along each axis of the input array. The amount of padding is specified by `pad_width`.
`pad_width` is a tuple of integer padding widths for each axis of the format
``(before_1, after_1, ... , before_N, after_N)``. The `pad_width` should be of length ``2*N``
where ``N`` is the number of dimensions of the array.
For dimension ``N`` of the input array, ``before_N`` and ``after_N`` indicates how many values
to add before and after the elements of the array along dimension ``N``.
The widths of the higher two dimensions ``before_1``, ``after_1``, ``before_2``,
``after_2`` must be 0.
Example::

   x = [[[[  1.   2.   3.]
          [  4.   5.   6.]]

         [[  7.   8.   9.]
          [ 10.  11.  12.]]]

        [[[ 11.  12.  13.]
          [ 14.  15.  16.]]

         [[ 17.  18.  19.]
          [ 20.  21.  22.]]]]

   pad(x, mode="edge", pad_width=(0,0,0,0,1,1,1,1)) =

         [[[[  1.   1.   2.   3.   3.]
            [  1.   1.   2.   3.   3.]
            [  4.   4.   5.   6.   6.]
            [  4.   4.   5.   6.   6.]]

           [[  7.   7.   8.   9.   9.]
            [  7.   7.   8.   9.   9.]
            [ 10.  10.  11.  12.  12.]
            [ 10.  10.  11.  12.  12.]]]

          [[[ 11.  11.  12.  13.  13.]
            [ 11.  11.  12.  13.  13.]
            [ 14.  14.  15.  16.  16.]
            [ 14.  14.  15.  16.  16.]]

           [[ 17.  17.  18.  19.  19.]
            [ 17.  17.  18.  19.  19.]
            [ 20.  20.  21.  22.  22.]
            [ 20.  20.  21.  22.  22.]]]]

   pad(x, mode="constant", constant_value=0, pad_width=(0,0,0,0,1,1,1,1)) =

         [[[[  0.   0.   0.   0.   0.]
            [  0.   1.   2.   3.   0.]
            [  0.   4.   5.   6.   0.]
            [  0.   0.   0.   0.   0.]]

           [[  0.   0.   0.   0.   0.]
            [  0.   7.   8.   9.   0.]
            [  0.  10.  11.  12.   0.]
            [  0.   0.   0.   0.   0.]]]

          [[[  0.   0.   0.   0.   0.]
            [  0.  11.  12.  13.   0.]
            [  0.  14.  15.  16.   0.]
            [  0.   0.   0.   0.   0.]]

           [[  0.   0.   0.   0.   0.]
            [  0.  17.  18.  19.   0.]
            [  0.  20.  21.  22.   0.]
            [  0.   0.   0.   0.   0.]]]]
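
   The ``reflect`` mode mirrors the array about its edges without repeating the
   border values. For the same input, an illustrative result computed from the
   reflection-padding implementation above:

   pad(x, mode="reflect", pad_width=(0,0,0,0,1,1,1,1)) =

         [[[[  5.   4.   5.   6.   5.]
            [  2.   1.   2.   3.   2.]
            [  5.   4.   5.   6.   5.]
            [  2.   1.   2.   3.   2.]]

           [[ 11.  10.  11.  12.  11.]
            [  8.   7.   8.   9.   8.]
            [ 11.  10.  11.  12.  11.]
            [  8.   7.   8.   9.   8.]]]

          [[[ 15.  14.  15.  16.  15.]
            [ 12.  11.  12.  13.  12.]
            [ 15.  14.  15.  16.  15.]
            [ 12.  11.  12.  13.  12.]]

           [[ 21.  20.  21.  22.  21.]
            [ 18.  17.  18.  19.  18.]
            [ 21.  20.  21.  22.  21.]
            [ 18.  17.  18.  19.  18.]]]]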
)code" ADD_FILELINE)
.add_argument("data", "NDArray-or-Symbol", "An n-dimensional input array.")
.add_arguments(PadParam::__FIELDS__());
NNVM_REGISTER_OP(Pad).add_alias("pad");
} // namespace op
} // namespace mxnet