| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| /*! |
| * Copyright (c) 2015 by Contributors |
| * \file matrix_op-inl.h |
| * \brief Function definition of matrix related operators |
| */ |
| #ifndef MXNET_OPERATOR_TENSOR_MATRIX_OP_INL_H_ |
| #define MXNET_OPERATOR_TENSOR_MATRIX_OP_INL_H_ |
| |
| #include <mxnet/operator_util.h> |
| #include <vector> |
| #include <algorithm> |
| #include <utility> |
| #include <type_traits> |
| #include "../mshadow_op.h" |
| #include "../elemwise_op_common.h" |
| #include "../channel_op_common.h" |
| #include "../mxnet_op.h" |
| #include "broadcast_reduce_op.h" |
| #include "./init_op.h" |
| #include "../../common/static_array.h" |
| #include "./slice-inl.h" |
| |
| #if MXNET_USE_CUDA |
| #include <thrust/device_vector.h> |
| #endif |
| |
| namespace mxnet { |
| namespace op { |
| |
| struct ReshapeParam : public dmlc::Parameter<ReshapeParam> { |
| mxnet::TShape target_shape; |
| bool keep_highest; |
| nnvm::Tuple<int> shape; |
| bool reverse; |
| DMLC_DECLARE_PARAMETER(ReshapeParam) { |
| DMLC_DECLARE_FIELD(shape) |
| .set_default(nnvm::Tuple<int>()) |
| .describe("The target shape"); |
| DMLC_DECLARE_FIELD(reverse) |
| .set_default(false) |
|       .describe("If true, then the special values are inferred from right to left."); |
| DMLC_DECLARE_FIELD(target_shape) |
| .set_default(mxnet::TShape()) |
| .describe("(Deprecated! Use ``shape`` instead.) " |
| "Target new shape. One and only one dim can be 0, " |
| "in which case it will be inferred from the rest of dims"); |
| DMLC_DECLARE_FIELD(keep_highest).set_default(false) |
|     .describe("(Deprecated! Use ``shape`` instead.) Whether to keep the highest dim unchanged. " |
|               "If set to true, the first dim in target_shape is ignored, " |
|               "and always fixed to the first dim of the input."); |
| } |
| }; |
| |
| template<typename IType> |
| inline mxnet::TShape InferReshapeShape(const nnvm::Tuple<IType>& shape, |
| const mxnet::TShape& dshape, bool reverse) { |
| std::vector<IType> dshape_vec; |
| std::vector<IType> param_shape_vec(shape.begin(), shape.end()); |
| for (index_t i = 0; i < dshape.ndim(); ++i) { |
| dshape_vec.push_back(dshape[i]); |
| } |
| std::vector<IType> tmp; |
| size_t src_idx = 0; |
| int inf_idx = -1; |
| if (reverse) { |
| std::reverse(dshape_vec.begin(), dshape_vec.end()); |
| std::reverse(param_shape_vec.begin(), param_shape_vec.end()); |
| } |
| auto dshape_len = dshape_vec.size(); |
| auto params_len = param_shape_vec.size(); |
| for (size_t i = 0; i < params_len; ++i) { |
| IType proposed_dim = param_shape_vec[i]; |
| if (proposed_dim == 0) { |
| // keep same |
| CHECK_LT(src_idx, dshape_len); |
| tmp.push_back(dshape_vec[src_idx++]); |
| } else if (proposed_dim == -1) { |
| // infer |
| CHECK_LT(inf_idx, 0) << "One and only one dim can be inferred"; |
| inf_idx = i; |
| tmp.push_back(1); |
| src_idx++; |
| } else if (proposed_dim == -2) { |
| // copy all remaining dims from source |
| while (src_idx < dshape_len) { |
| size_t dn = dshape_vec[src_idx++]; |
| tmp.push_back(dn); |
| } |
| } else if (proposed_dim == -3) { |
| // merge two dims from source |
| CHECK_LT(src_idx, dshape_len-1); |
| size_t d1 = dshape_vec[src_idx++]; |
| size_t d2 = dshape_vec[src_idx++]; |
| size_t dn = d1 * d2; |
| tmp.push_back(dn); |
| } else if (proposed_dim == -4) { |
| // split the source dim s into two dims |
| // read the left dim and then the right dim (either can be -1) |
| CHECK_LT(i + 2, params_len); |
| CHECK_LT(src_idx, dshape_len); |
| size_t d0 = dshape_vec[src_idx++]; |
| IType d1 = param_shape_vec[++i]; |
| IType d2 = param_shape_vec[++i]; |
| CHECK(d1 != -1 || d2 != -1) << "Split dims cannot both be -1."; |
| if (d1 == -1) d1 = d0 / d2; |
| if (d2 == -1) d2 = d0 / d1; |
| CHECK(d1 * d2 == static_cast<IType>(d0) || static_cast<IType>(d0) == IType(0)) << |
| "Split dims " << d1 << ", " << d2 << " do not divide original dim " << d0; |
| tmp.push_back(d1); |
| tmp.push_back(d2); |
| } else { |
| // greater than 0, new shape |
| tmp.push_back(proposed_dim); |
| src_idx++; |
| } |
| } |
| |
| if (inf_idx >= 0) { |
| if (dshape.Size() > 0) { |
| IType new_size = 1; |
| for (IType x : tmp) new_size *= x; |
| tmp[inf_idx] = dshape.Size() / new_size; |
| } else { |
| tmp[inf_idx] = 0; |
| } |
| } |
| if (reverse) { |
| std::reverse(param_shape_vec.begin(), param_shape_vec.end()); |
| std::reverse(dshape_vec.begin(), dshape_vec.end()); |
| std::reverse(tmp.begin(), tmp.end()); |
| } |
| mxnet::TShape oshape(tmp.begin(), tmp.end()); |
| return oshape; |
| } |
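| |
| // Illustrative examples of InferReshapeShape (comment-only sketch; shapes |
| // chosen for exposition). For dshape = (2, 3, 4): |
| //   shape = (6, 1, -1)     -> (6, 1, 4)     (-1 infers the remaining size) |
| //   shape = (0, -1)        -> (2, 12)       (0 copies the matching input dim) |
| //   shape = (-3, 4)        -> (6, 4)        (-3 merges two consecutive dims) |
| //   shape = (-4, 1, 2, -2) -> (1, 2, 3, 4)  (-4 splits a dim; -2 copies the rest) |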
| |
| inline bool ReverseReshapeInferShape(mxnet::TShape *in, const mxnet::TShape& out) { |
| if (in->Size() && out.Size()) { |
| return true; |
| } else if (!out.Size()) { |
| return false; |
| } else { |
| int zero_axis = -1; |
| int non_zero_prod = 1; |
| for (index_t i = 0; i < in->ndim(); i++) { |
| if ((*in)[i] == 0) { |
| if (zero_axis != -1) |
| return false; // more than 1 zero found. |
| else |
| zero_axis = i; |
| } else { |
| non_zero_prod *= (*in)[i]; |
| } |
| } |
| (*in)[zero_axis] = out.Size() / non_zero_prod; |
| return true; |
| } |
| } |
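| |
| // Example (comment-only sketch): in = (2, 0, 4) with out.Size() = 24 fills |
| // the single zero axis with 24 / (2 * 4) = 3, giving in = (2, 3, 4). |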
| |
| inline bool ReshapeShape(const nnvm::NodeAttrs& attrs, |
| mxnet::ShapeVector *in_attrs, |
| mxnet::ShapeVector *out_attrs) { |
| const ReshapeParam& param_ = nnvm::get<ReshapeParam>(attrs.parsed); |
| CHECK_EQ(in_attrs->size(), 1U) << "Input: [data]"; |
| CHECK_EQ(out_attrs->size(), 1U); |
| mxnet::TShape &dshape = (*in_attrs)[0]; |
| if (dshape.ndim() == 0) return false; |
| mxnet::TShape oshape; |
| if (param_.shape.ndim() != 0) { |
| oshape = InferReshapeShape(param_.shape, dshape, param_.reverse); |
| } else if (param_.target_shape.ndim()) { |
|     LOG(INFO) << "The target_shape parameter is deprecated; use shape instead."; |
| oshape = param_.target_shape; |
| int neg_count = 0; |
| index_t inf_idx = 0; |
| index_t start_idx = param_.keep_highest ? 1 : 0; |
| if (param_.keep_highest) { |
| oshape[0] = dshape[0]; |
| } |
| for (index_t i = start_idx; i < oshape.ndim(); ++i) { |
| if (oshape[i] == 0) { |
| neg_count++; |
| inf_idx = i; |
| } |
| } |
| if (neg_count == 1) { |
| oshape[inf_idx] = 1; |
| oshape[inf_idx] = dshape.Size() / oshape.Size(); |
| } |
| } else { |
| return (*out_attrs)[0].ndim() && ReverseReshapeInferShape(&(*in_attrs)[0], (*out_attrs)[0]); |
| } |
| ReverseReshapeInferShape(&dshape, oshape); |
| CHECK_EQ(oshape.Size(), dshape.Size()) |
|       << "Target shape size is different from the source. " |
| << "Target: " << oshape |
| << "\nSource: " << dshape; |
| SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); |
| return ReverseReshapeInferShape(&(*in_attrs)[0], (*out_attrs)[0]); |
| } |
| |
| inline bool FlattenShape(const nnvm::NodeAttrs& attrs, |
| mxnet::ShapeVector *in_attrs, |
| mxnet::ShapeVector *out_attrs) { |
| CHECK_EQ(in_attrs->size(), 1U) << "Input: [data]"; |
| CHECK_EQ(out_attrs->size(), 1U); |
| const mxnet::TShape &dshape = (*in_attrs)[0]; |
| if (dshape.ndim() == 0) return false; |
| uint32_t target_dim = 1; |
| for (uint32_t i = 1; i < dshape.ndim(); ++i) { |
| target_dim *= dshape[i]; |
| } |
| SHAPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::Shape2(dshape[0], target_dim)); |
| return true; |
| } |
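| |
| // Example (comment-only sketch): Flatten keeps the leading dim and collapses |
| // the rest, so an input of shape (2, 3, 4) produces an output of shape (2, 12). |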
| |
| struct TransposeParam : public dmlc::Parameter<TransposeParam> { |
| mxnet::TShape axes; |
| DMLC_DECLARE_PARAMETER(TransposeParam) { |
| DMLC_DECLARE_FIELD(axes).set_default(mxnet::TShape()) |
| .describe("Target axis order. By default the axes will be inverted."); |
| } |
| }; |
| |
| template<typename xpu> |
| void TransposeImpl(RunContext ctx, |
| const TBlob& src, |
| const TBlob& ret, |
| const mxnet::TShape& axes) { |
| using namespace mshadow; |
| using namespace mshadow::expr; |
| CHECK_EQ(src.type_flag_, ret.type_flag_); |
| Stream<xpu> *s = ctx.get_stream<xpu>(); |
| MSHADOW_TYPE_SWITCH(ret.type_flag_, DType, { |
| switch (axes.ndim()) { |
| case 0: |
| break; |
| case 1: { |
| Tensor<xpu, 1, DType> in = src.get<xpu, 1, DType>(s); |
| Tensor<xpu, 1, DType> out = ret.get<xpu, 1, DType>(s); |
| Copy(out, in, s); |
| break; |
| } |
| case 2: { |
| mshadow::Tensor<xpu, 2, DType> in = src.FlatTo2D<xpu, DType>(s); |
| mshadow::Tensor<xpu, 2, DType> out = ret.FlatTo2D<xpu, DType>(s); |
| if (axes[0] == 1 && axes[1] == 0) { |
| out = in.T(); |
| } else { |
| Copy(out, in, s); |
| } |
| break; |
| } |
| case 3: { |
| Tensor<xpu, 3, DType> in = src.get<xpu, 3, DType>(s); |
| Tensor<xpu, 3, DType> out = ret.get<xpu, 3, DType>(s); |
| out = transpose(in, axes.get<3>()); |
| break; |
| } |
| case 4: { |
| Tensor<xpu, 4, DType> in = src.get<xpu, 4, DType>(s); |
| Tensor<xpu, 4, DType> out = ret.get<xpu, 4, DType>(s); |
| out = transpose(in, axes.get<4>()); |
| break; |
| } |
| case 5: { |
| Tensor<xpu, 5, DType> in = src.get<xpu, 5, DType>(s); |
| Tensor<xpu, 5, DType> out = ret.get<xpu, 5, DType>(s); |
| out = transpose(in, axes.get<5>()); |
| break; |
| } |
| case 6: { |
| Tensor<xpu, 6, DType> in = src.get<xpu, 6, DType>(s); |
| Tensor<xpu, 6, DType> out = ret.get<xpu, 6, DType>(s); |
| out = transpose(in, axes.get<6>()); |
| break; |
| } |
| default: |
|        LOG(FATAL) << "Transpose supports at most 6 dimensions"; |
| break; |
| } |
| }); |
| } |
| |
| // matrix transpose |
| template<typename xpu> |
| void Transpose(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| const TransposeParam& param = nnvm::get<TransposeParam>(attrs.parsed); |
| CHECK_EQ(req[0], kWriteTo) << "Transpose does not support inplace"; |
| if (param.axes.ndim() == 0) { |
| mxnet::TShape axes = mxnet::TShape(inputs[0].ndim()); |
| for (index_t i = 0; i < axes.ndim(); ++i) { |
| axes[i] = axes.ndim() - 1 - i; |
| } |
| TransposeImpl<xpu>(ctx.run_ctx, inputs[0], outputs[0], axes); |
| } else { |
| TransposeImpl<xpu>(ctx.run_ctx, inputs[0], outputs[0], param.axes); |
| } |
| } |
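| |
| // Example (comment-only sketch): with no axes given, a (2, 3, 4) input is |
| // transposed with axes = (2, 1, 0), producing a (4, 3, 2) output. |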
| |
| inline bool TransposeShape(const nnvm::NodeAttrs& attrs, |
| mxnet::ShapeVector *in_attrs, |
| mxnet::ShapeVector *out_attrs) { |
| const TransposeParam& param = nnvm::get<TransposeParam>(attrs.parsed); |
| CHECK_EQ(in_attrs->size(), 1U); |
| CHECK_EQ(out_attrs->size(), 1U); |
| mxnet::TShape& shp = (*in_attrs)[0]; |
|   CHECK_LE(shp.ndim(), 6U) << "Transpose supports at most 6 dimensions"; |
| mxnet::TShape ret(shp.ndim()); |
| if (param.axes.ndim() == 0) { |
| for (index_t i = 0; i < shp.ndim(); ++i) { |
| ret[i] = shp[shp.ndim()-1-i]; |
| } |
| } else { |
| CHECK_EQ(shp.ndim(), param.axes.ndim()); |
| for (size_t i = 0; i < shp.ndim(); ++i) { |
| CHECK(param.axes[i] < static_cast<int64_t>(shp.ndim())); |
| ret[i] = shp[param.axes[i]]; |
| } |
| } |
| SHAPE_ASSIGN_CHECK(*out_attrs, 0, ret); |
| return true; |
| } |
| |
| |
| struct ExpandDimParam : public dmlc::Parameter<ExpandDimParam> { |
| int axis; |
| DMLC_DECLARE_PARAMETER(ExpandDimParam) { |
| DMLC_DECLARE_FIELD(axis) |
|     .describe("Position where the new axis is to be inserted. Suppose that " |
|               "the input `NDArray`'s dimension is `ndim`; then the valid range of " |
|               "the inserted axis is `[-ndim, ndim]`."); |
| } |
| }; |
| |
| |
| inline bool ExpandDimShape(const nnvm::NodeAttrs& attrs, |
| mxnet::ShapeVector *in_attrs, |
| mxnet::ShapeVector *out_attrs) { |
| const ExpandDimParam& param = nnvm::get<ExpandDimParam>(attrs.parsed); |
| CHECK_EQ(in_attrs->size(), 1U); |
| CHECK_EQ(out_attrs->size(), 1U); |
| if (in_attrs->at(0).ndim() == 0U && out_attrs->at(0).ndim() == 0U) { |
| return false; |
| } |
| |
| mxnet::TShape& ishape = (*in_attrs)[0]; |
| mxnet::TShape& oshape = (*out_attrs)[0]; |
| int indim = ishape.ndim(); |
| bool unknown_ishape = false; |
| if (0 == indim) { |
| indim = oshape.ndim() - 1; |
| unknown_ishape = true; |
| } |
| |
| int axis = param.axis; |
| if (axis < 0) { |
| axis += indim + 1; |
| } |
| CHECK(axis >= 0 && axis <= indim) |
| << "axis must be in the range [" << -indim << ", " << indim << "] (" |
| << param.axis << " provided)"; |
| mxnet::TShape ret(indim + 1); |
| for (int i = 0; i < axis; ++i) { |
| ret[i] = (unknown_ishape? 0 : ishape[i]); |
| } |
| ret[axis] = 1; |
| for (int i = axis+1; i < indim+1; ++i) { |
| ret[i] = (unknown_ishape? 0 : ishape[i-1]); |
| } |
| SHAPE_ASSIGN_CHECK(*out_attrs, 0, ret); |
| |
| ret = mxnet::TShape(indim); |
| for (int i = 0; i < axis; ++i) ret[i] = oshape[i]; |
| for (int i = axis+1; i < indim+1; ++i) ret[i-1] = oshape[i]; |
| SHAPE_ASSIGN_CHECK(*in_attrs, 0, ret); |
| return true; |
| } |
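| |
| // Example (comment-only sketch): expand_dims on an input of shape (2, 3) with |
| // axis = 1 (equivalently axis = -2) yields the output shape (2, 1, 3). |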
| |
| // Currently MKLDNN supports the slice op only when every step is 1 or has no value |
| inline bool SupportMKLDNNSlice(const SliceParam& param) { |
| if (param.step.ndim() == 0U) return true; |
| for (uint32_t i = 0; i < param.step.ndim(); ++i) { |
| if (param.step[i].has_value() && param.step[i].value() != 1) |
| return false; |
| } |
| return true; |
| } |
| |
| inline bool SliceForwardInferStorageType(const nnvm::NodeAttrs& attrs, |
| const int dev_mask, |
| DispatchMode* dispatch_mode, |
| std::vector<int>* in_attrs, |
| std::vector<int>* out_attrs) { |
| CHECK_EQ(in_attrs->size(), 1); |
| CHECK_EQ(out_attrs->size(), 1); |
| const SliceParam& param = nnvm::get<SliceParam>(attrs.parsed); |
| const auto& in_stype = in_attrs->at(0); |
| auto& out_stype = out_attrs->at(0); |
| bool dispatched = false; |
| const auto dispatch_ex = DispatchMode::kFComputeEx; |
|   // If step = 1, there is no need to fall back; otherwise fall back to dense |
| bool trivial_step = false; |
| if (param.step.ndim() == 0U) { |
| trivial_step = true; |
| } else if (param.step.ndim() == 1U |
| && (!param.step[0].has_value() || param.step[0].value() == 1)) { |
| trivial_step = true; |
| } |
| |
| if (in_stype == kDefaultStorage) { |
| #if MXNET_USE_MKLDNN == 1 |
| if (dev_mask == Context::kCPU && MKLDNNEnvSet() |
| && SupportMKLDNNSlice(param)) { |
| dispatched = storage_type_assign(&out_stype, kDefaultStorage, |
| dispatch_mode, dispatch_ex); |
| } |
| #endif |
| if (!dispatched) { |
| dispatched = storage_type_assign(&out_stype, kDefaultStorage, |
| dispatch_mode, DispatchMode::kFCompute); |
| } |
| } |
| |
| if (!dispatched && in_stype == kCSRStorage && trivial_step) { |
| dispatched = storage_type_assign(&out_stype, kCSRStorage, |
| dispatch_mode, dispatch_ex); |
| } |
| |
| if (!dispatched) { |
| dispatched = dispatch_fallback(out_attrs, dispatch_mode); |
| } |
| |
| return dispatched; |
| } |
| |
| // slice the indptr of a csr |
| struct SliceCsrIndPtr { |
| template<typename IType> |
| MSHADOW_XINLINE static void Map(int i, IType* out, const IType* in, const IType* base) { |
| KERNEL_ASSIGN(out[i], kWriteTo, in[i] - *base); |
| } |
| }; |
| |
| /* |
|  * A wrapper to launch the SliceCsrIndPtr kernel. |
|  * Copies the inclusive range src[begin] .. src[end] (end - begin + 1 entries) |
|  * into dst[0] .. dst[end - begin], rebasing each entry by src[begin]. |
|  */ |
| template<typename xpu, typename IType> |
| void SliceCsrIndPtrImpl(const int begin, const int end, RunContext ctx, |
| const IType* src, IType* dst) { |
| using namespace mshadow; |
| using namespace mxnet_op; |
| Stream<xpu> *s = ctx.get_stream<xpu>(); |
| int indptr_len = end - begin + 1; |
| Kernel<SliceCsrIndPtr, xpu>::Launch(s, indptr_len, dst, src + begin, src + begin); |
| } |
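| |
| // Example (comment-only sketch; values assumed for exposition): with |
| //   src = {0, 2, 5, 7}  (indptr of a 3-row CSR matrix) |
| // slicing rows [1, 3) launches end - begin + 1 = 3 threads and writes |
| //   dst = {2-2, 5-2, 7-2} = {0, 3, 5}, |
| // i.e. the sliced indptr is rebased so that dst[0] == 0. |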
| |
| /* |
| * Slice a CSR NDArray for first dimension |
| */ |
| template<typename xpu> |
| void SliceDimOneCsrImpl(const mxnet::TShape &begin, const mxnet::TShape &end, const OpContext& ctx, |
| const NDArray &in, const NDArray &out) { |
| using namespace mshadow; |
| using namespace mxnet_op; |
| using namespace csr; |
| nnvm::dim_t begin_row = begin[0]; |
| nnvm::dim_t end_row = end[0]; |
| nnvm::dim_t indptr_len = end_row - begin_row + 1; |
| out.CheckAndAllocAuxData(kIndPtr, Shape1(indptr_len)); |
|   // assume idx and indptr share the same type |
| MSHADOW_IDX_TYPE_SWITCH(in.aux_type(kIndPtr), RType, { |
| MSHADOW_IDX_TYPE_SWITCH(in.aux_type(kIdx), IType, { |
| MSHADOW_TYPE_SWITCH(in.dtype(), DType, { |
| RType* in_indptr = in.aux_data(kIndPtr).dptr<RType>(); |
| RType* out_indptr = out.aux_data(kIndPtr).dptr<RType>(); |
| SliceCsrIndPtrImpl<xpu, RType>(begin_row, end_row, ctx.run_ctx, in_indptr, out_indptr); |
| |
| Stream<xpu> *s = ctx.get_stream<xpu>(); |
| |
| RType nnz = 0; |
| mshadow::Copy(Tensor<cpu, 1, RType>(&nnz, Shape1(1)), |
| Tensor<xpu, 1, RType>(out_indptr + indptr_len - 1, Shape1(1), s)); |
| // return csr zeros if nnz = 0 |
| if (nnz == 0) { |
| out.set_aux_shape(kIdx, Shape1(0)); |
| return; |
| } |
| // copy indices and values |
| out.CheckAndAllocAuxData(kIdx, Shape1(nnz)); |
| out.CheckAndAllocData(Shape1(nnz)); |
| IType* in_idx = in.aux_data(kIdx).dptr<IType>(); |
| IType* out_idx = out.aux_data(kIdx).dptr<IType>(); |
| DType* in_data = in.data().dptr<DType>(); |
| DType* out_data = out.data().dptr<DType>(); |
| |
| RType offset = 0; |
| mshadow::Copy(Tensor<cpu, 1, RType>(&offset, Shape1(1)), |
| Tensor<xpu, 1, RType>(in_indptr + begin_row, Shape1(1), s)); |
| |
| mshadow::Copy(Tensor<xpu, 1, IType>(out_idx, Shape1(nnz), s), |
| Tensor<xpu, 1, IType>(in_idx + offset, Shape1(nnz), s), s); |
| mshadow::Copy(Tensor<xpu, 1, DType>(out_data, Shape1(nnz), s), |
| Tensor<xpu, 1, DType>(in_data + offset, Shape1(nnz), s), s); |
| }); |
| }); |
| }); |
| } |
| |
| /*! |
| * \brief slice a CSRNDArray for two dimensions |
| */ |
| struct SliceDimTwoCsrAssign { |
| /*! |
| * \brief This function slices a CSRNDArray on axis one between begin_col and end_col |
| * \param i loop index |
| * \param out_idx output csr ndarray column indices |
| * \param out_data output csr ndarray data |
| * \param out_indptr output csr ndarray row index pointer |
| * \param in_idx input csr ndarray column indices |
| * \param in_data input csr ndarray data |
| * \param in_indptr input csr ndarray row index pointer |
|    * \param begin_col begin column index |
|    * \param end_col end column index |
| */ |
| template<typename IType, typename RType, typename DType> |
| MSHADOW_XINLINE static void Map(int i, |
| IType* out_idx, DType* out_data, |
| const RType* out_indptr, |
| const IType* in_idx, const DType* in_data, |
| const RType* in_indptr, |
| const int begin_col, const int end_col) { |
| RType ind = out_indptr[i]; |
| for (RType j = in_indptr[i]; j < in_indptr[i+1]; j++) { |
| // indices of CSRNDArray are in ascending order per row |
| if (in_idx[j] >= end_col) { |
| break; |
| } else if (in_idx[j] >= begin_col) { |
| out_idx[ind] = in_idx[j] - begin_col; |
| out_data[ind] = in_data[j]; |
| ind++; |
| } |
| } |
| } |
| }; |
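| |
| // Example (comment-only sketch): a row with column indices {0, 2, 5, 7}, |
| // begin_col = 2 and end_col = 6 keeps the entries at columns {2, 5}, |
| // remapped to output columns {0, 3}. |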
| |
| /* |
| * Slice a CSR NDArray for two dimensions |
| */ |
| template<typename xpu> |
| void SliceDimTwoCsrImpl(const mxnet::TShape &begin, const mxnet::TShape &end, const OpContext& ctx, |
| const NDArray &in, const NDArray &out); |
| |
| |
| template<typename xpu> |
| void SliceCsrImpl(const SliceParam ¶m, const OpContext& ctx, |
| const NDArray &in, OpReqType req, const NDArray &out) { |
| if (req == kNullOp) return; |
| CHECK_NE(req, kAddTo) << "kAddTo for Slice on CSR input is not supported"; |
| CHECK_NE(req, kWriteInplace) << "kWriteInplace for Slice on CSR input is not supported"; |
| |
| const mxnet::TShape ishape = in.shape(); |
| const mxnet::TShape oshape = out.shape(); |
| |
| uint32_t N = ishape.ndim(); |
| mxnet::TShape begin(N), end(N); |
| for (uint32_t i = 0; i < N; ++i) { |
| int s = 0; |
| if (param.begin[i]) { |
| s = *param.begin[i]; |
| if (s < 0) s += ishape[i]; |
| } |
| begin[i] = s; |
| end[i] = s + oshape[i]; |
| } |
| switch (N) { |
| case 1: { |
| SliceDimOneCsrImpl<xpu>(begin, end, ctx, in, out); |
| break; |
| } |
| case 2: { |
| SliceDimTwoCsrImpl<xpu>(begin, end, ctx, in, out); |
| break; |
| } |
| default: |
|       LOG(FATAL) << "Slice on CSR input is only supported for 2-D shapes"; |
| break; |
| } |
| } |
| |
| template<typename xpu> |
| void SliceEx(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<NDArray>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<NDArray>& outputs) { |
| CHECK_EQ(inputs.size(), 1); |
| CHECK_EQ(outputs.size(), 1); |
| const SliceParam& param = nnvm::get<SliceParam>(attrs.parsed); |
| auto in_stype = inputs[0].storage_type(); |
| if (in_stype == kCSRStorage) { |
| SliceCsrImpl<xpu>(param, ctx, inputs[0], req[0], outputs[0]); |
| } else { |
|     LOG(FATAL) << "Slice not implemented for storage type " << in_stype; |
| } |
| } |
| |
| template<int ndim> |
| inline void GetIndexRange(const mxnet::TShape& dshape, |
| const nnvm::Tuple<dmlc::optional<int>>& param_begin, |
| const nnvm::Tuple<dmlc::optional<int>>& param_end, |
| const nnvm::Tuple<dmlc::optional<int>>& param_step, |
| common::StaticArray<index_t, ndim>* begin, |
| common::StaticArray<index_t, ndim>* end, |
| common::StaticArray<index_t, ndim>* step) { |
| CHECK_NE(dshape.ndim(), 0U); |
| CHECK_LE(param_begin.ndim(), dshape.ndim()) |
| << "Slicing axis exceeds data dimensions"; |
| CHECK_LE(param_end.ndim(), dshape.ndim()) |
| << "Slicing axis exceeds data dimensions"; |
| CHECK_EQ(param_begin.ndim(), param_end.ndim()) |
| << "begin and end must have the same length"; |
| CHECK_EQ(ndim, dshape.ndim()) |
| << "Static array size=" << ndim |
| << " is not equal to data shape ndim=" << dshape.ndim(); |
| |
| if (param_step.ndim() != 0U) { |
| CHECK_EQ(param_step.ndim(), param_begin.ndim()) |
| << "step and begin must have the same length"; |
| } |
| |
| for (index_t i = 0; i < param_begin.ndim(); ++i) { |
| index_t s = param_step.ndim() != 0U && param_step[i].has_value() ? param_step[i].value() : 1; |
| CHECK_NE(s, 0) << "slice op step[" << i << "] cannot be 0"; |
| |
| index_t b = 0, e = 0; |
| const index_t len = dshape[i]; |
| if (len > 0) { |
| b = param_begin[i].has_value() ? param_begin[i].value() : (s < 0 ? len - 1 : 0); |
| e = param_end[i].has_value() ? param_end[i].value() : (s < 0 ? -1 : len); |
| |
| // checking upper and lower bounds for begin |
| if (b < 0) { |
| b += len; |
| CHECK_GE(b, 0) << "slicing with begin[" << i << "]=" << b - len |
| << " exceeds limit of input dimension[" << i << "]=" << len; |
| } |
| CHECK_LT(b, len) << "slicing with begin[" << i << "]=" << b |
| << " exceeds limit of input dimension[" << i << "]=" << len; |
| |
| // checking upper and lower bounds for end |
| if (e < 0 && param_end[i].has_value()) { |
| e += len; |
| CHECK_GE(e, 0) << "slicing with end[" << i << "]=" << e - len |
| << " exceeds limit of input dimension[" << i << "]=" << len; |
| } |
| CHECK_LE(e, len) << "slicing with end[" << i << "]=" << e |
| << " exceeds limit of input dimension[" << i << "]=" << len; |
| |
| // checking begin==end case which is not supported |
| CHECK_NE(b, e) << "slicing with begin[" << i << "]=end[" << i << "]=" |
| << e << " results in an empty tensor and is not supported"; |
| } |
| |
| (*begin)[i] = b; |
| (*end)[i] = e; |
| (*step)[i] = s; |
| } |
| |
| for (index_t i = param_begin.ndim(); i < dshape.ndim(); ++i) { |
| (*begin)[i] = 0; |
| (*end)[i] = dshape[i]; |
| (*step)[i] = 1; |
| } |
| } |
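| |
| // Example (comment-only sketch): for dshape = (10, 10) with |
| // param_begin = (None, 2), param_end = (None, 8) and param_step = (-1, 2), |
| // the resolved ranges are begin = (9, 2), end = (-1, 8), step = (-1, 2): |
| // a missing begin/end defaults to the far end of the axis when the step is |
| // negative, and to 0/len otherwise. |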
| |
| inline void SetSliceOpOutputDimSize(const index_t i, const int b, |
| const int e, const int s, |
| mxnet::TShape* oshape) { |
| if (e != b) { |
| if (s > 0) { |
|       CHECK_LT(b, e) << "slicing with begin[" << i << "]=" << b << ", end[" << i << "]=" |
| << e << ", and step[" << i << "]=" << s << " is invalid"; |
| (*oshape)[i] = (e - b - 1) / s + 1; |
| } else { |
|       CHECK_LT(e, b) << "slicing with begin[" << i << "]=" << b << ", end[" << i << "]=" |
| << e << ", and step[" << i << "]=" << s << " is invalid"; |
| (*oshape)[i] = (b - e - 1) / (-s) + 1; |
| } |
| } // else leave oshape[i] as 0 for partial infer |
| } |
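| |
| // Example (comment-only sketch): begin = 1, end = 7, step = 2 selects indices |
| // {1, 3, 5}, so the output dim is (7 - 1 - 1) / 2 + 1 = 3; begin = 7, end = 1, |
| // step = -2 selects {7, 5, 3} and likewise yields 3. |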
| |
| inline bool SliceOpShape(const nnvm::NodeAttrs& attrs, |
| mxnet::ShapeVector* in_attrs, |
| mxnet::ShapeVector* out_attrs) { |
| CHECK_EQ(in_attrs->size(), 1U); |
| CHECK_EQ(out_attrs->size(), 1U); |
| const mxnet::TShape& dshape = (*in_attrs)[0]; |
| if (dshape.ndim() == 0) return false; |
| const SliceParam& param = nnvm::get<SliceParam>(attrs.parsed); |
| mxnet::TShape oshape = dshape; |
| |
| MXNET_NDIM_SWITCH(dshape.ndim(), ndim, { |
| common::StaticArray<index_t, ndim> begin, end, step; |
| GetIndexRange(dshape, param.begin, param.end, param.step, &begin, &end, &step); |
| for (index_t i = 0; i < param.begin.ndim(); ++i) { |
| const int b = begin[i], e = end[i], s = step[i]; |
| SetSliceOpOutputDimSize(i, b, e, s, &oshape); |
| } |
| }); |
| |
| SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); |
| return !shape_is_none(dshape) && !shape_is_none(oshape); |
| } |
| |
| template<int ndim, int req, typename xpu> |
| struct slice_forward; |
| |
| template<int ndim, int req> |
| struct slice_forward<ndim, req, gpu> { |
| // i is the i-th row after flattening out into 2D tensor |
| template<typename DType> |
| MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* data, |
| const mshadow::Shape<ndim> dshape, |
| const mshadow::Shape<ndim> oshape, |
| const common::StaticArray<index_t, ndim> begin, |
| const common::StaticArray<index_t, ndim> step) { |
| const index_t data_last_dim_size = dshape[ndim-1]; |
| const index_t out_last_dim_size = oshape[ndim-1]; |
| const index_t step_last_dim = step[ndim-1]; |
| const index_t begin_last_dim = begin[ndim-1]; |
| const index_t j = i % out_last_dim_size; |
|     index_t irow = 0;  // row id of flattened 2D data |
| index_t stride = 1; |
| index_t idx = i / out_last_dim_size; |
| #pragma unroll |
| for (int k = ndim - 2; k >= 0; --k) { |
| irow += stride * ((idx % oshape[k]) * step[k] + begin[k]); |
| idx /= oshape[k]; |
| stride *= dshape[k]; |
| } |
| KERNEL_ASSIGN(out[i], req, |
| data[irow * data_last_dim_size + j * step_last_dim + begin_last_dim]); |
| } |
| }; |
| |
| template<int ndim, int req> |
| struct slice_forward<ndim, req, cpu> { |
| // i is the i-th row after flattening out into 2D tensor |
| template<typename DType> |
| MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* data, |
| const mshadow::Shape<ndim> dshape, |
| const mshadow::Shape<ndim> oshape, |
| const common::StaticArray<index_t, ndim> begin, |
| const common::StaticArray<index_t, ndim> step) { |
| const index_t data_last_dim_size = dshape[ndim-1]; |
| const index_t out_last_dim_size = oshape[ndim-1]; |
| const index_t step_last_dim = step[ndim-1]; |
| const index_t begin_last_dim = begin[ndim-1]; |
| index_t out_offset = i * out_last_dim_size; |
| for (index_t j = 0; j < out_last_dim_size; ++j) { |
|       index_t irow = 0;  // row id of flattened 2D data |
| index_t stride = 1; |
| index_t idx = i; |
| #pragma unroll |
| for (int k = ndim - 2; k >= 0; --k) { |
| irow += stride * ((idx % oshape[k]) * step[k] + begin[k]); |
| idx /= oshape[k]; |
| stride *= dshape[k]; |
| } |
| KERNEL_ASSIGN(out[out_offset++], req, |
| data[irow * data_last_dim_size + j * step_last_dim + begin_last_dim]); |
| } |
| } |
| }; |
| |
| template<typename xpu> |
| void SliceOpForward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| CHECK_EQ(inputs.size(), 1U); |
| CHECK_EQ(outputs.size(), 1U); |
| CHECK_EQ(req.size(), 1U); |
| if (req[0] == kNullOp) return; |
| using namespace mshadow; |
| Stream<xpu>* s = ctx.get_stream<xpu>(); |
| const TBlob& data = inputs[0]; |
| const TBlob& out = outputs[0]; |
| const SliceParam& param = nnvm::get<SliceParam>(attrs.parsed); |
| MXNET_NDIM_SWITCH(data.ndim(), ndim, { |
| common::StaticArray<index_t, ndim> begin, end, step; |
| GetIndexRange(data.shape_, param.begin, param.end, param.step, &begin, &end, &step); |
| MSHADOW_TYPE_SWITCH(out.type_flag_, DType, { |
| MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { |
| size_t num_threads = out.shape_.FlatTo2D()[0]; |
| if (std::is_same<xpu, gpu>::value) { |
| num_threads *= out.shape_.get<ndim>()[ndim - 1]; |
| } |
| mxnet_op::Kernel<slice_forward<ndim, Req, xpu>, xpu>::Launch(s, num_threads, |
| out.dptr<DType>(), data.dptr<DType>(), |
| data.shape_.get<ndim>(), out.shape_.get<ndim>(), begin, step); |
| }) |
| }) |
| }) |
| } |
| |
| template<int ndim, int req, typename xpu> |
| struct slice_assign; |
| |
| template<int ndim, int req> |
| struct slice_assign<ndim, req, cpu> { |
| // i is the i-th row after flattening out into 2D tensor |
| template<typename DType> |
| MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* val, |
| const mshadow::Shape<ndim> oshape, |
| const mshadow::Shape<ndim> vshape, |
| const common::StaticArray<index_t, ndim> begin, |
| const common::StaticArray<index_t, ndim> step) { |
| const index_t data_last_dim_size = oshape[ndim-1]; |
| const index_t out_last_dim_size = vshape[ndim-1]; |
| const index_t step_last_dim = step[ndim-1]; |
| const index_t begin_last_dim = begin[ndim-1]; |
| index_t offset = i * out_last_dim_size; |
| for (index_t j = 0; j < out_last_dim_size; ++j) { |
|       index_t irow = 0;  // row id of flattened 2D out |
| index_t stride = 1; |
| index_t idx = i; |
| #pragma unroll |
| for (int k = ndim - 2; k >= 0; --k) { |
| irow += stride * ((idx % vshape[k]) * step[k] + begin[k]); |
| idx /= vshape[k]; |
| stride *= oshape[k]; |
| } |
| KERNEL_ASSIGN(out[irow * data_last_dim_size + j * step_last_dim + begin_last_dim], |
| req, val[offset++]); |
| } |
| } |
| }; |
| |
| template<int ndim, int req> |
| struct slice_assign<ndim, req, gpu> { |
| // i is the i-th row after flattening out into 2D tensor |
| template<typename DType> |
| MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* val, |
| const mshadow::Shape<ndim> oshape, |
| const mshadow::Shape<ndim> vshape, |
| const common::StaticArray<index_t, ndim> begin, |
| const common::StaticArray<index_t, ndim> step) { |
| const index_t data_last_dim_size = oshape[ndim-1]; |
| const index_t out_last_dim_size = vshape[ndim-1]; |
| const index_t step_last_dim = step[ndim-1]; |
| const index_t begin_last_dim = begin[ndim-1]; |
| const index_t j = i % out_last_dim_size; |
|     index_t irow = 0;  // row id of flattened 2D out |
| index_t stride = 1; |
| index_t idx = i / out_last_dim_size; |
| #pragma unroll |
| for (int k = ndim - 2; k >= 0; --k) { |
| irow += stride * ((idx % vshape[k]) * step[k] + begin[k]); |
| idx /= vshape[k]; |
| stride *= oshape[k]; |
| } |
| KERNEL_ASSIGN(out[irow * data_last_dim_size + j * step_last_dim + begin_last_dim], |
| req, val[i]); |
| } |
| }; |
| |
| template<typename xpu> |
| void SliceOpBackward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| CHECK_EQ(inputs.size(), 1U); |
| CHECK_EQ(outputs.size(), 1U); |
| CHECK_EQ(req.size(), 1U); |
| if (req[0] == kNullOp) return; |
| using namespace mshadow; |
| Stream<xpu>* s = ctx.get_stream<xpu>(); |
| const TBlob& ograd = inputs[0]; |
| const TBlob& igrad = outputs[0]; |
| const SliceParam& param = nnvm::get<SliceParam>(attrs.parsed); |
| if (req[0] == kWriteTo) { |
| Fill(s, igrad, req[0], 0); |
| } else if (req[0] == kWriteInplace) { |
| LOG(FATAL) << "_slice_backward does not support kWriteInplace"; |
| } |
| MXNET_NDIM_SWITCH(ograd.ndim(), ndim, { |
| common::StaticArray<index_t, ndim> begin, end, step; |
| GetIndexRange(igrad.shape_, param.begin, param.end, param.step, &begin, &end, &step); |
| MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, { |
| MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { |
| int num_threads = ograd.shape_.FlatTo2D()[0]; |
| if (std::is_same<xpu, gpu>::value) { |
| num_threads *= ograd.shape_.get<ndim>()[ndim - 1]; |
| } |
| mxnet_op::Kernel<slice_assign<ndim, Req, xpu>, xpu>::Launch(s, num_threads, |
| igrad.dptr<DType>(), ograd.dptr<DType>(), |
| igrad.shape_.get<ndim>(), ograd.shape_.get<ndim>(), begin, step); |
| }) |
| }) |
| }) |
| } |
| |
| inline bool SliceAssignOpShape(const nnvm::NodeAttrs& attrs, |
| mxnet::ShapeVector *in_attrs, |
| mxnet::ShapeVector *out_attrs) { |
| CHECK_EQ(in_attrs->size(), 2U); |
| CHECK_EQ(out_attrs->size(), 1U); |
| const mxnet::TShape& dshape = (*in_attrs)[0]; |
| if (dshape.ndim() == 0U || dshape.Size() == 0U) return false; |
| mxnet::TShape vshape = dshape; // vshape is the value shape on the right hand side |
| const SliceParam& param = nnvm::get<SliceParam>(attrs.parsed); |
| MXNET_NDIM_SWITCH(dshape.ndim(), ndim, { |
| common::StaticArray<index_t, ndim> begin, end, step; |
| GetIndexRange(dshape, param.begin, param.end, param.step, &begin, &end, &step); |
| for (index_t i = 0; i < param.begin.ndim(); ++i) { |
| const int b = begin[i], e = end[i], s = step[i]; |
| SetSliceOpOutputDimSize(i, b, e, s, &vshape); |
| } |
| }); |
| SHAPE_ASSIGN_CHECK(*in_attrs, 1, vshape); |
| SHAPE_ASSIGN_CHECK(*out_attrs, 0, dshape); |
| return true; |
| } |
| |
| template<typename xpu> |
| void SliceAssignOpForward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| using namespace mshadow; |
| CHECK_EQ(inputs.size(), 2U); // data[index] = val, data and val are two inputs |
| CHECK_EQ(outputs.size(), 1U); |
| if (req[0] == kNullOp) return; |
| Stream<xpu> *s = ctx.get_stream<xpu>(); |
| const TBlob& data = inputs[0]; |
| const TBlob& val = inputs[1]; |
| const TBlob& out = outputs[0]; |
| if (req[0] == kWriteTo) { |
| MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { |
| Tensor<xpu, 1, DType> in = inputs[0].FlatTo1D<xpu, DType>(s); |
| Tensor<xpu, 1, DType> out = outputs[0].FlatTo1D<xpu, DType>(s); |
| Copy(out, in, s); |
| }); |
| } else if (req[0] != kWriteInplace) { |
| LOG(FATAL) << "_slice_assign only supports kWriteTo and kWriteInplace"; |
| } |
| |
| const SliceParam& param = nnvm::get<SliceParam>(attrs.parsed); |
| MXNET_NDIM_SWITCH(data.ndim(), ndim, { |
| common::StaticArray<index_t, ndim> begin, end, step; |
| GetIndexRange(data.shape_, param.begin, param.end, param.step, &begin, &end, &step); |
| MSHADOW_TYPE_SWITCH(out.type_flag_, DType, { |
| MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { |
| int num_threads = val.shape_.FlatTo2D()[0]; |
| if (std::is_same<xpu, gpu>::value) { |
| num_threads *= val.shape_.get<ndim>()[ndim - 1]; |
| } |
| mxnet_op::Kernel<slice_assign<ndim, Req, xpu>, xpu>::Launch(s, num_threads, |
| out.dptr<DType>(), val.dptr<DType>(), |
| out.shape_.get<ndim>(), val.shape_.get<ndim>(), begin, step); |
| }) |
| }) |
| }) |
| } |
| |
| struct SliceAssignScalarParam : public dmlc::Parameter<SliceAssignScalarParam> { |
| double scalar; |
| nnvm::Tuple<dmlc::optional<int>> begin, end; |
| nnvm::Tuple<dmlc::optional<int>> step; |
| DMLC_DECLARE_PARAMETER(SliceAssignScalarParam) { |
| DMLC_DECLARE_FIELD(scalar) |
| .set_default(0) |
| .describe("The scalar value for assignment."); |
| DMLC_DECLARE_FIELD(begin) |
| .describe("starting indices for the slice operation, supports negative indices."); |
| DMLC_DECLARE_FIELD(end) |
| .describe("ending indices for the slice operation, supports negative indices."); |
| DMLC_DECLARE_FIELD(step) |
| .set_default(nnvm::Tuple<dmlc::optional<int>>()) |
| .describe("step for the slice operation, supports negative values."); |
| } |
| }; |
| |
| inline bool SliceAssignScalarOpShape(const nnvm::NodeAttrs& attrs, |
| mxnet::ShapeVector *in_attrs, |
| mxnet::ShapeVector *out_attrs) { |
| CHECK_EQ(in_attrs->size(), 1U); |
| CHECK_EQ(out_attrs->size(), 1U); |
| const mxnet::TShape& dshape = (*in_attrs)[0]; |
| if (dshape.ndim() == 0U || dshape.Size() == 0U) return false; |
| SHAPE_ASSIGN_CHECK(*out_attrs, 0, dshape); |
| return true; |
| } |
| |
| template<int ndim> |
| struct slice_assign_scalar { |
| // i is the i-th row after flattening out into 2D tensor |
| template<typename DType> |
| MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType val, |
| const OpReqType req, |
| const mshadow::Shape<ndim> oshape, |
| const mshadow::Shape<ndim> vshape, |
| const common::StaticArray<index_t, ndim> begin, |
| const common::StaticArray<index_t, ndim> step) { |
| const index_t data_last_dim_size = oshape[ndim-1]; |
| const index_t out_last_dim_size = vshape[ndim-1]; |
| const index_t step_last_dim = step[ndim-1]; |
| const index_t begin_last_dim = begin[ndim-1]; |
| for (index_t j = 0; j < out_last_dim_size; ++j) { |
|       index_t irow = 0;  // row id of flattened 2D out |
| index_t stride = 1; |
| index_t idx = i; |
| #pragma unroll |
| for (int k = ndim - 2; k >= 0; --k) { |
| irow += stride * ((idx % vshape[k]) * step[k] + begin[k]); |
| idx /= vshape[k]; |
| stride *= oshape[k]; |
| } |
| KERNEL_ASSIGN(out[irow * data_last_dim_size + j * step_last_dim + begin_last_dim], req, val); |
| } |
| } |
| }; |
| |
| template<typename xpu> |
| void SliceAssignScalarOpForward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| CHECK_EQ(inputs.size(), 1U); |
| CHECK_EQ(outputs.size(), 1U); |
| CHECK_EQ(req.size(), 1U); |
| using namespace mshadow; |
| Stream<xpu> *s = ctx.get_stream<xpu>(); |
| |
| const TBlob& data = inputs[0]; |
| const TBlob& out = outputs[0]; |
| if (req[0] == kWriteTo) { |
| MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { |
| Tensor<xpu, 1, DType> in = inputs[0].FlatTo1D<xpu, DType>(s); |
| Tensor<xpu, 1, DType> out = outputs[0].FlatTo1D<xpu, DType>(s); |
| Copy(out, in, s); |
| }); |
| } else if (req[0] != kWriteInplace) { |
| LOG(FATAL) << "_crop_assign_scalar only supports kWriteTo and kWriteInplace"; |
| } |
| |
| mxnet::TShape vshape = data.shape_; |
| const SliceAssignScalarParam& param = nnvm::get<SliceAssignScalarParam>(attrs.parsed); |
| MXNET_NDIM_SWITCH(data.ndim(), ndim, { |
| common::StaticArray<index_t, ndim> begin, end, step; |
| GetIndexRange(data.shape_, param.begin, param.end, param.step, &begin, &end, &step); |
| for (index_t i = 0; i < param.begin.ndim(); ++i) { |
| const int b = begin[i], e = end[i], s = step[i]; |
| SetSliceOpOutputDimSize(i, b, e, s, &vshape); |
| } |
| MSHADOW_TYPE_SWITCH(out.type_flag_, DType, { |
| mxnet_op::Kernel<slice_assign_scalar<ndim>, xpu>::Launch(s, vshape.FlatTo2D()[0], |
| out.dptr<DType>(), static_cast<DType>(param.scalar), req[0], |
| out.shape_.get<ndim>(), vshape.get<ndim>(), begin, step); |
| }) |
| }) |
| } |
| |
| struct SliceAxisParam : public dmlc::Parameter<SliceAxisParam> { |
| int axis; |
| int begin; |
| dmlc::optional<int> end; |
| DMLC_DECLARE_PARAMETER(SliceAxisParam) { |
| DMLC_DECLARE_FIELD(axis) |
|       .describe("Axis along which to be sliced, supports negative indices."); |
|     DMLC_DECLARE_FIELD(begin) |
|       .describe("The beginning index along the axis to be sliced, " |
|                 "supports negative indices."); |
|     DMLC_DECLARE_FIELD(end) |
|       .describe("The ending index along the axis to be sliced, " |
|                 "supports negative indices."); |
| } |
| }; |
| |
| inline void GetSliceAxisParams(const SliceAxisParam& param, const mxnet::TShape& ishape, |
| int* axis, index_t* begin, index_t* end) { |
| *axis = param.axis; |
| if (*axis < 0) { |
| *axis += static_cast<int>(ishape.ndim()); |
| } |
| CHECK(*axis < static_cast<int>(ishape.ndim()) && *axis >= 0) << |
|     "Transformed axis must be smaller than the source ndim and non-negative! Received axis=" << |
| param.axis << ", src_ndim=" << ishape.ndim() << ", transformed axis=" << *axis; |
| index_t axis_size = static_cast<index_t>(ishape[*axis]); |
| *begin = param.begin; |
| *end = -1; |
| if (*begin < 0) { |
| *begin += axis_size; |
| } |
| if (axis_size) { |
| if (!static_cast<bool>(param.end)) { |
| *end = axis_size; |
| } else { |
| *end = param.end.value(); |
| if (*end < 0) { |
| *end += axis_size; |
| } |
| } |
|     CHECK(*end <= axis_size) << "Invalid end: end=" << *end << " exceeds axis_size=" << axis_size; |
|     CHECK(*begin < *end) |
|       << "Invalid begin and end: got begin=" << param.begin << ", end=" << param.end; |
| } else { |
| *begin = 0; |
| *end = 0; |
| } |
|   CHECK(*end >= 0) |
|     << "Invalid begin and end: got begin=" << param.begin << ", end=" << param.end; |
|   CHECK(*begin >= 0) << "Invalid begin: begin=" << param.begin; |
| } |
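| |
| // Example (comment-only sketch): for ishape = (2, 3, 4), axis = -1, |
| // begin = -3 and an unset end, the params resolve to axis = 2, begin = 1, |
| // end = 4. |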
| |
| inline bool SliceAxisShape(const nnvm::NodeAttrs& attrs, |
| mxnet::ShapeVector *in_attrs, |
| mxnet::ShapeVector *out_attrs) { |
| const SliceAxisParam& param = nnvm::get<SliceAxisParam>(attrs.parsed); |
| CHECK_EQ(in_attrs->size(), 1U); |
| CHECK_EQ(out_attrs->size(), 1U); |
| mxnet::TShape& ishape = (*in_attrs)[0]; |
| int axis; |
| index_t begin, end; |
| GetSliceAxisParams(param, ishape, &axis, &begin, &end); |
| mxnet::TShape shape(ishape.ndim()); |
| for (index_t i = 0; i < ishape.ndim(); ++i) { |
| if (static_cast<int>(i) == axis) { |
| shape[i] = static_cast<index_t>(end - begin); |
| } else { |
| shape[i] = ishape[i]; |
| } |
| } |
| SHAPE_ASSIGN_CHECK(*out_attrs, 0, shape); |
| return true; |
| } |
| |
| |
| template<typename xpu> |
| void SliceAxis(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| using namespace mshadow::expr; |
| const SliceAxisParam& param = nnvm::get<SliceAxisParam>(attrs.parsed); |
| mshadow::Stream<xpu> *s = ctx.get_stream<xpu>(); |
| int axis; |
| index_t begin, end; |
| GetSliceAxisParams(param, inputs[0].shape_, &axis, &begin, &end); |
| int ndim = static_cast<int>(outputs[0].ndim()); |
| |
| if (axis + 1 == ndim) { |
| MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { |
| mshadow::Tensor<xpu, 2, DType> in = |
| inputs[0].FlatTo2D<xpu, DType>(s); |
| mshadow::Tensor<xpu, 2, DType> out = |
| outputs[0].FlatTo2D<xpu, DType>(s); |
| ASSIGN_DISPATCH(out, req[0], slice<1>(in, begin, end)); |
| }); |
| } else { |
| MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { |
| mshadow::Tensor<xpu, 3, DType> in = |
| inputs[0].FlatTo3D<xpu, DType>(axis, s); |
| mshadow::Tensor<xpu, 3, DType> out = |
| outputs[0].FlatTo3D<xpu, DType>(axis, s); |
| ASSIGN_DISPATCH(out, req[0], slice<1>(in, begin, end)); |
| }); |
| } |
| } |
| |
| // Backward pass of broadcast over the given axis |
| template<typename xpu> |
| void SliceAxisGrad_(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| const SliceAxisParam& param = nnvm::get<SliceAxisParam>(attrs.parsed); |
| using namespace mshadow::op; |
| using namespace mshadow::expr; |
| mshadow::Stream<xpu> *s = ctx.get_stream<xpu>(); |
| int axis; |
| index_t begin, end; |
| GetSliceAxisParams(param, outputs[0].shape_, &axis, &begin, &end); |
| int ndim = static_cast<int>(outputs[0].shape_.ndim()); |
| |
| if (axis + 1 == ndim) { |
| MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { |
| mshadow::Tensor<xpu, 2, DType> ograd = |
| inputs[0].FlatTo2D<xpu, DType>(s); |
| mshadow::Tensor<xpu, 2, DType> igrad = |
| outputs[0].FlatTo2D<xpu, DType>(s); |
| if (req[0] == kAddTo) { |
| slice<1>(igrad, begin, end) += F<identity>(ograd); |
| } else if (req[0] == kWriteTo) { |
| igrad = 0.0f; |
| slice<1>(igrad, begin, end) = F<identity>(ograd); |
| } else { |
| CHECK_EQ(req[0], kNullOp); |
| } |
| }); |
| } else { |
| MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { |
| mshadow::Tensor<xpu, 3, DType> ograd = |
| inputs[0].FlatTo3D<xpu, DType>(axis, s); |
| mshadow::Tensor<xpu, 3, DType> igrad = |
| outputs[0].FlatTo3D<xpu, DType>(axis, s); |
| if (req[0] == kAddTo) { |
| slice<1>(igrad, begin, end) += F<identity>(ograd); |
| } else if (req[0] == kWriteTo) { |
| igrad = 0.0f; |
| slice<1>(igrad, begin, end) = F<identity>(ograd); |
| } else { |
| CHECK_EQ(req[0], kNullOp); |
| } |
| }); |
| } |
| } |
| |
| struct SliceLikeParam : public dmlc::Parameter<SliceLikeParam> { |
| mxnet::TShape axes; |
| DMLC_DECLARE_PARAMETER(SliceLikeParam) { |
| DMLC_DECLARE_FIELD(axes).set_default(mxnet::TShape()) |
| .describe("List of axes on which input data will be sliced according to the " |
| "corresponding size of the second input. By default will slice on " |
| "all axes. Negative axes are supported."); |
| } |
| }; |
| |
| inline bool SliceLikeShape(const nnvm::NodeAttrs& attrs, |
| mxnet::ShapeVector *in_attrs, |
| mxnet::ShapeVector *out_attrs) { |
| const SliceLikeParam& param = nnvm::get<SliceLikeParam>(attrs.parsed); |
| CHECK_EQ(in_attrs->size(), 2U); |
| CHECK_EQ(out_attrs->size(), 1U); |
| mxnet::TShape& ishape = (*in_attrs)[0]; |
| mxnet::TShape& from_shape = (*in_attrs)[1]; |
| if (param.axes.ndim() == 0) { |
| CHECK_EQ(ishape.ndim(), from_shape.ndim()) |
|       << "By default slice_like performs slice on all axes, but ndim mismatch " |
| "for inputs: " << ishape.ndim() << " vs. " << from_shape.ndim(); |
| for (index_t i = 0; i < ishape.ndim(); ++i) { |
| CHECK_GE(ishape[i], from_shape[i]) |
| << "Slice axis " << i << " with size " << from_shape[i] |
|         << " exceeds limit of input with size " << ishape[i]; |
| } |
| SHAPE_ASSIGN_CHECK(*out_attrs, 0, from_shape); |
| } else { |
| mxnet::TShape shape(ishape); |
| for (index_t i = 0; i < param.axes.ndim(); ++i) { |
| int axis = static_cast<int>(param.axes[i]); |
| if (axis < 0) { |
| axis += static_cast<int>(ishape.ndim()); |
| } |
| CHECK_GE(axis, 0) |
| << "Slice axis: " << static_cast<int>(param.axes[i]) << " too small"; |
| CHECK_GT(ishape.ndim(), axis) |
| << "Slice axis: " << axis << " exceeds first input: " << ishape.ndim(); |
| CHECK_GT(from_shape.ndim(), axis) |
| << "Slice axis: " << axis << " exceeds second input: " << from_shape.ndim(); |
| shape[axis] = from_shape[axis]; |
| CHECK_GE(ishape[axis], from_shape[axis]) |
| << "Slice axis " << axis << " with size " << from_shape[axis] |
|         << " exceeds limit of input with size " << ishape[axis]; |
| } |
| SHAPE_ASSIGN_CHECK(*out_attrs, 0, shape); |
| } |
| return true; |
| } |
| |
| inline void SliceLikeInferRanges(const mxnet::TShape& dshape, |
| const mxnet::TShape& fshape, |
| const mxnet::TShape& axes, |
| nnvm::Tuple<dmlc::optional<int>>* param_begin, |
| nnvm::Tuple<dmlc::optional<int>>* param_end, |
| nnvm::Tuple<dmlc::optional<int>>* param_step) { |
| std::vector<dmlc::optional<int>> pb(dshape.ndim()); |
| std::vector<dmlc::optional<int>> pe(dshape.ndim()); |
| std::vector<dmlc::optional<int>> ps(dshape.ndim()); |
| if (axes.ndim() == 0) { |
| for (index_t i = 0; i < dshape.ndim(); ++i) { |
| pb[i] = 0; |
| pe[i] = fshape[i]; |
| ps[i] = 1; |
| } |
| } else { |
| for (index_t i = 0; i < axes.ndim(); ++i) { |
| int axis = static_cast<int>(axes[i]); |
| if (axis < 0) { |
| axis += static_cast<int>(dshape.ndim()); |
| } |
| CHECK_GE(axis, 0) |
| << "Slice axis: " << static_cast<int>(axes[i]) << " too small"; |
| CHECK_LT(axis, static_cast<int>(dshape.ndim())) |
| << "Slice axis: " << axis << " exceeds first input: " << dshape.ndim(); |
| CHECK_LT(axis, static_cast<int>(fshape.ndim())) |
|       << "Slice axis: " << axis << " exceeds second input: " << fshape.ndim(); |
| pb[axis] = 0; |
| pe[axis] = fshape[axis]; |
| ps[axis] = 1; |
| } |
| } |
| *param_begin = nnvm::Tuple<dmlc::optional<int>>(pb.begin(), pb.end()); |
| *param_end = nnvm::Tuple<dmlc::optional<int>>(pe.begin(), pe.end()); |
| *param_step = nnvm::Tuple<dmlc::optional<int>>(ps.begin(), ps.end()); |
| } |
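| |
| // Example (comment-only sketch): dshape = (4, 5), fshape = (2, 3), |
| // axes = (1,) produces begin = (None, 0), end = (None, 3), step = (None, 1); |
| // axis 0 is left unconstrained, so GetIndexRange keeps its full range. |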
| |
| template<typename xpu> |
| void SliceLikeForward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| CHECK_EQ(inputs.size(), 2U); |
| CHECK_EQ(outputs.size(), 1U); |
| CHECK_EQ(req.size(), 1U); |
| using namespace mshadow::expr; |
| const SliceLikeParam& param = nnvm::get<SliceLikeParam>(attrs.parsed); |
| mshadow::Stream<xpu> *s = ctx.get_stream<xpu>(); |
| const TBlob& data = inputs[0]; |
| const TBlob& out = outputs[0]; |
| const mxnet::TShape& ishape = data.shape_; |
| const mxnet::TShape& from_shape = inputs[1].shape_; |
| nnvm::Tuple<dmlc::optional<int>> param_begin; |
| nnvm::Tuple<dmlc::optional<int>> param_end; |
| nnvm::Tuple<dmlc::optional<int>> param_step; |
| SliceLikeInferRanges(ishape, from_shape, param.axes, ¶m_begin, ¶m_end, ¶m_step); |
| |
| MXNET_NDIM_SWITCH(data.ndim(), ndim, { |
| common::StaticArray<index_t, ndim> begin, end, step; |
| GetIndexRange(data.shape_, param_begin, param_end, param_step, &begin, &end, &step); |
| MSHADOW_TYPE_SWITCH(out.type_flag_, DType, { |
| MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { |
| int num_threads = out.shape_.FlatTo2D()[0]; |
| if (std::is_same<xpu, gpu>::value) { |
| num_threads *= out.shape_.get<ndim>()[ndim - 1]; |
| } |
| mxnet_op::Kernel<slice_forward<ndim, Req, xpu>, xpu>::Launch(s, |
| num_threads, out.dptr<DType>(), data.dptr<DType>(), |
| data.shape_.get<ndim>(), out.shape_.get<ndim>(), begin, step); |
| }) |
| }) |
| }) |
| } |
| |
| template<typename xpu> |
| void SliceLikeBackward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| CHECK_EQ(inputs.size(), 1U); |
| CHECK_EQ(outputs.size(), 2U); |
| CHECK_EQ(req.size(), 2U); |
| using namespace mshadow; |
| Stream<xpu>* s = ctx.get_stream<xpu>(); |
| if (req[1] != kNullOp && req[1] != kAddTo) { |
|     Fill(s, outputs[1], req[1], 0);  // Second input is not relevant to gradients. |
| } |
| if (req[0] == kNullOp) return; |
| const TBlob& ograd = inputs[0]; |
| const TBlob& igrad = outputs[0]; |
| const SliceLikeParam& param = nnvm::get<SliceLikeParam>(attrs.parsed); |
| if (req[0] == kWriteTo) { |
| Fill(s, igrad, req[0], 0); |
| } else if (req[0] == kWriteInplace) { |
| LOG(FATAL) << "_slice_like_backward does not support kWriteInplace"; |
| } |
| |
| const mxnet::TShape& ishape = ograd.shape_; |
| const mxnet::TShape& from_shape = outputs[1].shape_; |
| nnvm::Tuple<dmlc::optional<int>> param_begin; |
| nnvm::Tuple<dmlc::optional<int>> param_end; |
| nnvm::Tuple<dmlc::optional<int>> param_step; |
| SliceLikeInferRanges(ishape, from_shape, param.axes, ¶m_begin, ¶m_end, ¶m_step); |
| |
| MXNET_NDIM_SWITCH(ograd.ndim(), ndim, { |
| common::StaticArray<index_t, ndim> begin, end, step; |
| GetIndexRange(ograd.shape_, param_begin, param_end, param_step, &begin, &end, &step); |
| MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, { |
| MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { |
| int num_threads = ograd.shape_.FlatTo2D()[0]; |
| if (std::is_same<xpu, gpu>::value) { |
| num_threads *= ograd.shape_.get<ndim>()[ndim - 1]; |
| } |
| mxnet_op::Kernel<slice_assign<ndim, Req, xpu>, xpu>::Launch(s, num_threads, |
| igrad.dptr<DType>(), ograd.dptr<DType>(), |
| igrad.shape_.get<ndim>(), ograd.shape_.get<ndim>(), begin, step); |
| }) |
| }) |
| }) |
| } |
| |
| struct ClipParam : public dmlc::Parameter<ClipParam> { |
| real_t a_min, a_max; |
| DMLC_DECLARE_PARAMETER(ClipParam) { |
| DMLC_DECLARE_FIELD(a_min) |
| .describe("Minimum value"); |
| DMLC_DECLARE_FIELD(a_max) |
| .describe("Maximum value"); |
| } |
| }; |
| |
| |
| struct clip { |
| template<typename DType> |
| MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* datas, |
| DType a_min, DType a_max) { |
| DType data = datas[i]; |
| if (data > a_max) { |
| out[i] = a_max; |
| } else if (data < a_min) { |
| out[i] = a_min; |
| } else { |
| out[i] = data; |
| } |
| } |
| }; |
| |
| |
| struct clip_grad { |
| template<typename DType> |
| MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* grad, const DType* datas, |
| DType a_min, DType a_max) { |
| DType data = datas[i]; |
| if (data > a_max) { |
| out[i] = 0; |
| } else if (data < a_min) { |
| out[i] = 0; |
| } else { |
| out[i] = grad[i]; |
| } |
| } |
| }; |
| |
| |
| template<typename xpu> |
| void Clip(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| using namespace mshadow; |
| const ClipParam& param = nnvm::get<ClipParam>(attrs.parsed); |
| CHECK_EQ(inputs[0].type_flag_, outputs[0].type_flag_); |
| Stream<xpu> *s = ctx.get_stream<xpu>(); |
| |
| MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { |
| mxnet_op::Kernel<mxnet::op::clip, xpu>::Launch(s, outputs[0].Size(), outputs[0].dptr<DType>(), |
| inputs[0].dptr<DType>(), |
| DType(param.a_min), DType(param.a_max)); |
| }); |
| } |
| |
| template<typename xpu> |
| void ClipEx(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<NDArray>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<NDArray>& outputs) { |
| CHECK_EQ(inputs[0].dtype(), outputs[0].dtype()); |
| CHECK_EQ(inputs[0].storage_type(), outputs[0].storage_type()); |
| CHECK_NE(inputs[0].storage_type(), kDefaultStorage); |
| UnaryOp::MapToFCompute<xpu>(attrs, ctx, inputs, req, outputs, Clip<xpu>); |
| } |
| |
| template<typename xpu> |
| void ClipGrad_(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| using namespace mshadow; |
| using namespace mxnet_op; |
| const ClipParam& param = nnvm::get<ClipParam>(attrs.parsed); |
| CHECK_EQ(inputs[0].type_flag_, outputs[0].type_flag_); |
| Stream<xpu> *s = ctx.get_stream<xpu>(); |
| MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { |
| Kernel<clip_grad, xpu>::Launch(s, outputs[0].Size(), outputs[0].dptr<DType>(), |
| inputs[0].dptr<DType>(), inputs[1].dptr<DType>(), DType(param.a_min), DType(param.a_max)); |
| }); |
| } |
| |
| /*! |
|  * \brief The parameters of the repeat operator include |
|  * the number of repetitions and an optional axis. |
|  * The parameters are later used to deduce the |
|  * output ndarray shape in the RepeatOpShape() function. |
| */ |
| struct RepeatParam : public dmlc::Parameter<RepeatParam> { |
| int repeats = 1; |
| dmlc::optional<int> axis; |
| DMLC_DECLARE_PARAMETER(RepeatParam) { |
| DMLC_DECLARE_FIELD(repeats) |
| .describe("The number of repetitions for each element."); |
| DMLC_DECLARE_FIELD(axis) |
| .set_default(dmlc::optional<int>()) |
| .describe("The axis along which to repeat values." |
|                 " Negative numbers are interpreted as counting from the end." |
| " By default, use the flattened input array," |
| " and return a flat output array."); |
| } |
| }; |
| |
| /*! |
| * \brief Helper function for getting user input params for the operator repeat. |
| * Sanity check the user input values. |
| */ |
| inline void GetRepeatParams(const RepeatParam& param, const mxnet::TShape& ishape, |
| int* repeats, dmlc::optional<int>* axisOpt) { |
| *repeats = param.repeats; |
| CHECK_GE(*repeats, 0) << "repeats cannot be a negative number"; |
| *axisOpt = param.axis; |
| if (static_cast<bool>(*axisOpt)) { |
| int ndims = static_cast<int>(ishape.ndim()); |
| int axis = axisOpt->value(); |
| if (axis < 0) { |
| axis += ndims; |
| } |
| CHECK(axis >= 0 && axis < ndims) << "axis = " << axisOpt->value() << " out of bounds"; |
| } |
| } |
| |
| inline bool RepeatOpShape(const nnvm::NodeAttrs& attrs, |
| mxnet::ShapeVector *in_attrs, |
| mxnet::ShapeVector *out_attrs) { |
| const RepeatParam& param = nnvm::get<RepeatParam>(attrs.parsed); |
| CHECK_EQ(in_attrs->size(), 1U); |
| CHECK_EQ(out_attrs->size(), 1U); |
| const mxnet::TShape& ishape = (*in_attrs)[0]; |
| int repeats = 0; |
| dmlc::optional<int> axisOpt; |
| GetRepeatParams(param, ishape, &repeats, &axisOpt); |
| // If 0 repeats, return an empty 0 dim array |
| if (0 == repeats) { |
| SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape()); |
| return true; |
| } |
| |
| // If repeats > 0, multiply the size of the corresponding axis by repeats |
| if (static_cast<bool>(axisOpt)) { |
| int ndims = static_cast<int>(ishape.ndim()); |
| int axis = axisOpt.value(); |
| if (axis < 0) { |
| axis += ndims; |
| } |
| mxnet::TShape shape(ishape.ndim()); |
| for (index_t i = 0; i < ishape.ndim(); ++i) { |
| if (static_cast<int>(i) == axis) { |
| shape[i] = static_cast<index_t>(repeats) * ishape[i]; |
| } else { |
| shape[i] = ishape[i]; |
| } |
| } |
| SHAPE_ASSIGN_CHECK(*out_attrs, 0, shape); |
| } else { // If axis is not input by user, return a flat 1D array of size = in.size*repeats |
| mxnet::TShape shape(1); |
| shape[0] = ishape.Size() * static_cast<index_t>(repeats); |
| SHAPE_ASSIGN_CHECK(*out_attrs, 0, shape); |
| } |
| return true; |
| } |
| |
| inline bool RepeatOpType(const nnvm::NodeAttrs& attrs, |
| std::vector<int>* in_attrs, |
| std::vector<int>* out_attrs) { |
| CHECK_EQ(in_attrs->size(), 1U); |
| if ((*in_attrs)[0] != -1) { |
| TYPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[0]); |
| } else if ((*out_attrs)[0] != -1) { |
| TYPE_ASSIGN_CHECK(*in_attrs, 0, (*out_attrs)[0]); |
| } |
| return true; |
| } |
| |
| /*! |
| * \brief Reshape the input and output tensors for |
 * using broadcast_to to achieve the functionality
 * of the repeat operator.
| * \return a pair of mxnet::TShape's, first is the reshaped |
| * input shape, second is the reshaped output shape. |
| */ |
| inline std::pair<mxnet::TShape, mxnet::TShape> ReshapeInputOutputForRepeatOp( |
| const mxnet::TShape& ishape, |
| const dmlc::optional<int>& axisOpt, |
| const int repeats) { |
| if (static_cast<bool>(axisOpt)) { |
| int axis = axisOpt.value(); |
| int ndim = static_cast<int>(ishape.ndim()); |
| if (axis < 0) { |
| axis += ndim; |
| } |
  CHECK(axis >= 0 && axis < static_cast<int>(ishape.ndim()))
    << "axis = " << axisOpt.value() << " out of bounds";
| |
| // reshape the input tensor by adding a dim at the (axis+1)-th dim |
| mxnet::TShape rshape(ishape.ndim()+1); |
| // the shape we want to broadcast to |
| mxnet::TShape bshape(rshape.ndim()); |
| int i = 0; |
| while (i <= axis) { |
| rshape[i] = bshape[i] = ishape[i]; |
| ++i; |
| } |
| rshape[i] = 1; |
| bshape[i] = repeats; |
| while (i < static_cast<int>(ishape.ndim())) { |
| rshape[i+1] = ishape[i]; |
| bshape[i+1] = ishape[i]; |
| ++i; |
| } |
| return std::make_pair(rshape, bshape); |
| } else { |
| // axis is not input by user |
| // reshape the tensor into shape (ishape.Size(), 1) |
| // then add one dim at axis = 1 and broadcast to |
| // shape (ishape.Size(), repeats) |
| mxnet::TShape rshape(2); |
| rshape[0] = ishape.Size(); |
| rshape[1] = 1; |
| |
| mxnet::TShape bshape(2); |
| bshape[0] = rshape[0]; |
| bshape[1] = repeats; |
| return std::make_pair(rshape, bshape); |
| } |
| } |
| |
| template<typename xpu> |
| void RepeatOpForward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| const TBlob& iTBlob = inputs[0]; |
| const mxnet::TShape& ishape = iTBlob.shape_; |
| if (ishape.ndim() == 0) return; |
| |
| int repeats = 0; |
| dmlc::optional<int> axisOpt; |
| const RepeatParam& param = nnvm::get<RepeatParam>(attrs.parsed); |
| GetRepeatParams(param, ishape, &repeats, &axisOpt); |
| if (0 == repeats) return; |
| |
| std::pair<mxnet::TShape, mxnet::TShape> rshapes = \ |
| ReshapeInputOutputForRepeatOp(ishape, axisOpt, repeats); |
| |
| // reshaped input tblob |
| TBlob iblob(inputs[0].dptr_, rshapes.first, inputs[0].dev_mask(), |
| inputs[0].type_flag_, inputs[0].dev_id()); |
| std::vector<TBlob> newInputs = {iblob}; |
| |
| // reshaped output tblob |
| TBlob oblob(outputs[0].dptr_, rshapes.second, outputs[0].dev_mask(), |
| outputs[0].type_flag_, outputs[0].dev_id()); |
| std::vector<TBlob> newOutputs = {oblob}; |
| |
| BroadcastCompute<xpu>(attrs, ctx, newInputs, req, newOutputs); |
| } |
| |
| /*! |
| * \brief Compute the gradient of the loss function |
| * with respect to the input of the operator. |
| * Backpropagation is employed to implement the |
| * chain rule. |
| * \param inputs the gradient of the loss function |
| * with respect to the outputs of the operator |
| * \param outputs the gradient of the loss function |
| * with respect to the inputs of the operator |
| */ |
| template<typename xpu> |
| void RepeatOpBackward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| CHECK_EQ(inputs.size(), 1U); |
| CHECK_EQ(outputs.size(), 1U); |
| |
| const mxnet::TShape& oshape = outputs[0].shape_; |
| if (oshape.ndim() == 0) return; |
| |
| int repeats = 0; |
| dmlc::optional<int> axisOpt; |
| const RepeatParam& param = nnvm::get<RepeatParam>(attrs.parsed); |
| GetRepeatParams(param, oshape, &repeats, &axisOpt); |
| if (0 == repeats) return; |
| |
| std::pair<mxnet::TShape, mxnet::TShape> rshapes = |
| ReshapeInputOutputForRepeatOp(oshape, axisOpt, repeats); |
| |
| // reshaped output grad tblob |
| TBlob oblob(outputs[0].dptr_, rshapes.first, outputs[0].dev_mask(), |
| outputs[0].type_flag_, outputs[0].dev_id()); |
| std::vector<TBlob> newOutputs = {oblob}; |
| |
| // reshaped input grad tblob |
| TBlob iblob(inputs[0].dptr_, rshapes.second, inputs[0].dev_mask(), |
| inputs[0].type_flag_, inputs[0].dev_id()); |
| std::vector<TBlob> newInputs = {iblob}; |
| |
| ReduceAxesComputeImpl<xpu, mshadow::red::sum, false>( |
| ctx, newInputs, req, newOutputs, rshapes.first); |
| } |
| |
| struct TileParam : public dmlc::Parameter<TileParam> { |
| mxnet::TShape reps; |
| DMLC_DECLARE_PARAMETER(TileParam) { |
| DMLC_DECLARE_FIELD(reps) |
| .describe("The number of times for repeating the tensor a. Each dim size of reps" |
| " must be a positive integer." |
| " If reps has length d, the result will have dimension of max(d, a.ndim);" |
| " If a.ndim < d, a is promoted to be d-dimensional by prepending new axes." |
| " If a.ndim > d, reps is promoted to a.ndim by pre-pending 1's to it."); |
| } |
| }; |
| |
| inline bool TileOpShape(const nnvm::NodeAttrs& attrs, |
| mxnet::ShapeVector *in_attrs, |
| mxnet::ShapeVector *out_attrs) { |
| CHECK_EQ(in_attrs->size(), 1U); |
| CHECK_EQ(out_attrs->size(), 1U); |
| const TileParam& param = nnvm::get<TileParam>(attrs.parsed); |
| const mxnet::TShape& ishape = (*in_attrs)[0]; |
| const mxnet::TShape& reps = param.reps; |
  // If reps is empty, the output shape is identical to the input shape
| if (reps.ndim() == 0 || ishape.ndim() == 0) { |
| SHAPE_ASSIGN_CHECK(*out_attrs, 0, ishape); |
| return true; |
| } |
  for (size_t i = 0; i < reps.ndim(); ++i) {
    CHECK_GT(reps[i], 0) << "invalid reps[" << i << "]=" << reps[i]
                         << ", dim size must be greater than zero";
| } |
| mxnet::TShape oshape(std::max(ishape.ndim(), reps.ndim())); |
| int i1 = static_cast<int>(ishape.ndim()) - 1; |
| int i2 = static_cast<int>(reps.ndim()) - 1; |
| for (int i = static_cast<int>(oshape.ndim()) - 1; i >= 0; --i) { |
| if (i1 >= 0 && i2 >= 0) { |
| oshape[i] = ishape[i1--] * reps[i2--]; |
| } else if (i1 >= 0) { |
| oshape[i] = ishape[i1--]; |
| } else if (i2 >= 0) { |
| oshape[i] = reps[i2--]; |
| } |
| } |
| SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); |
| return true; |
| } |
| |
| inline bool TileOpType(const nnvm::NodeAttrs& attrs, |
| std::vector<int>* in_attrs, |
| std::vector<int>* out_attrs) { |
| CHECK_EQ(in_attrs->size(), 1U); |
| if ((*in_attrs)[0] != -1) { |
| TYPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[0]); |
| } else if ((*out_attrs)[0] != -1) { |
| TYPE_ASSIGN_CHECK(*in_attrs, 0, (*out_attrs)[0]); |
| } |
| return true; |
| } |
| |
| /*! |
| * \brief Reshape the input and output tensors for |
 * using broadcast_to to achieve the functionality
 * of the tile operator.
| * \return a pair of mxnet::TShape's, first is the reshaped |
| * input shape, second is the reshaped output shape. |
| */ |
| inline std::pair<mxnet::TShape, mxnet::TShape> ReshapeInputOutputForTileOp( |
| const mxnet::TShape& ishape, |
| const mxnet::TShape& reps) { |
| if (ishape.ndim() == 0 || reps.ndim() == 0) { |
| return std::make_pair(ishape, ishape); |
| } |
| |
| // The shape we want to broadcast to |
| mxnet::TShape bshape(std::max(ishape.ndim(), reps.ndim()) * 2); |
| |
| // The shape of the input tensor after adding new axes before each dim |
| mxnet::TShape rshape(bshape.ndim()); |
| |
| int i1 = static_cast<int>(ishape.ndim()) - 1; |
| int i2 = static_cast<int>(reps.ndim()) - 1; |
| for (int i = static_cast<int>(bshape.ndim()) - 1; i >= 0; --i) { |
| if (0 == (i & 1)) { |
| bshape[i] = (i2 >= 0? reps[i2--] : 1); |
| rshape[i] = 1; |
| } else { |
| rshape[i] = bshape[i] = (i1 >= 0? ishape[i1--] : 1); |
| } |
| } |
| |
| return std::make_pair(rshape, bshape); |
| } |
| |
| /*! |
| * \brief Implementation of tiling the input tensor a based |
| * on the user-input shape, reps. |
 * If a.ndim < reps.ndim, new axes are prepended to a. For example,
 * if the input tensor has shape (3,) and reps is (2, 4), the input
 * tensor is reshaped to (1, 3).
 * If a.ndim > reps.ndim, 1's are prepended to reps. For example,
 * if the input tensor has shape (2, 3, 4, 5) and reps is (2, 2),
 * reps is changed to (1, 1, 2, 2).
| * Suppose we have a.ndim = reps.ndim now. To achieve tiling, |
| * we utilize the operator broadcast_to. For example, for a tensor |
| * of shape (2, 3, 4, 5) and reps (2, 8, 9, 3), we first reshape |
| * the tensor to the shape (1, 2, 1, 3, 1, 4, 1, 5) by adding |
| * one axis before each dimension. Then, we want to broadcast |
| * the new tensor to shape (2, 2, 8, 3, 9, 4, 3, 5). The final |
| * output tensor would have shape (2*2, 8*3, 9*4, 3*5). |
| */ |
| template<typename xpu> |
| void TileOpForward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| CHECK_EQ(inputs.size(), 1U); |
| CHECK_EQ(outputs.size(), 1U); |
| |
| if (inputs[0].Size() == 0) return; |
| const mxnet::TShape& ishape = inputs[0].shape_; |
| const mxnet::TShape& reps = nnvm::get<TileParam>(attrs.parsed).reps; |
| |
  // If any entry of reps is zero, return immediately
| for (index_t i = 0; i < reps.ndim(); ++i) { |
| if (0 == reps[i]) return; |
| } |
| |
| std::pair<mxnet::TShape, mxnet::TShape> rshapes = ReshapeInputOutputForTileOp(ishape, reps); |
| |
| // reshaped input tblob |
| TBlob iblob(inputs[0].dptr_, rshapes.first, inputs[0].dev_mask(), |
| inputs[0].type_flag_, inputs[0].dev_id()); |
| std::vector<TBlob> newInputs = {iblob}; |
| // reshaped output tblob |
| TBlob oblob(outputs[0].dptr_, rshapes.second, outputs[0].dev_mask(), |
| outputs[0].type_flag_, outputs[0].dev_id()); |
| std::vector<TBlob> newOutputs = {oblob}; |
| |
| BroadcastCompute<xpu>(attrs, ctx, newInputs, req, newOutputs); |
| } |
| |
| /*! |
| * \brief Compute the gradient of the loss function |
| * with respect to the input of the operator. |
| * Backpropagation is employed to implement the |
| * chain rule. |
| * \param inputs the gradient of the loss function |
| * with respect to the outputs of the operator |
| * \param outputs the gradient of the loss function |
| * with respect to the inputs of the operator |
| */ |
| template<typename xpu> |
| void TileOpBackward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| CHECK_EQ(inputs.size(), 1U); |
| CHECK_EQ(outputs.size(), 1U); |
| |
| if (inputs[0].Size() == 0) return; |
| const mxnet::TShape& oshape = outputs[0].shape_; |
| const mxnet::TShape& reps = nnvm::get<TileParam>(attrs.parsed).reps; |
| |
  // If any entry of reps is zero, return immediately
| for (index_t i = 0; i < reps.ndim(); ++i) { |
| if (0 == reps[i]) return; |
| } |
| |
| std::pair<mxnet::TShape, mxnet::TShape> rshapes = ReshapeInputOutputForTileOp(oshape, reps); |
| |
| // reshaped output grad tblob |
| TBlob oblob(outputs[0].dptr_, rshapes.first, outputs[0].dev_mask(), |
| outputs[0].type_flag_, outputs[0].dev_id()); |
| std::vector<TBlob> newOutputs = {oblob}; |
| // reshaped input grad tblob |
| TBlob iblob(inputs[0].dptr_, rshapes.second, inputs[0].dev_mask(), |
| inputs[0].type_flag_, inputs[0].dev_id()); |
| std::vector<TBlob> newInputs = {iblob}; |
| |
| ReduceAxesComputeImpl<xpu, mshadow::red::sum, false>( |
| ctx, newInputs, req, newOutputs, rshapes.first); |
| } |
| |
| struct ReverseParam : public dmlc::Parameter<ReverseParam> { |
| nnvm::Tuple<int> axis; |
| DMLC_DECLARE_PARAMETER(ReverseParam) { |
| DMLC_DECLARE_FIELD(axis) |
| .describe("The axis which to reverse elements."); |
| } |
| }; |
| |
| |
| #define REVERSE_MAX_DIM 10U |
| |
| struct reverse { |
| MSHADOW_XINLINE static int ReverseIndex(index_t idx, |
| index_t nreversedim, |
| const index_t * stride_, |
| const index_t * trailing_) { |
| index_t outputIndex = idx; |
| for (index_t i = 0; i < nreversedim; ++i) { |
| const index_t low = outputIndex % trailing_[i]; |
| index_t high = outputIndex / trailing_[i]; |
| const index_t x = high%stride_[i]; |
| high /= stride_[i]; |
| outputIndex = (high*stride_[i] + stride_[i] - 1 - x)*trailing_[i] + low; |
| } |
| return outputIndex; |
| } |
| #ifdef __CUDACC__ |
| template<typename DType> |
| __device__ static void Map(index_t index, index_t nreversedim, const DType *src, DType *dst, |
| const index_t * stride_, |
| const index_t * trailing_) { |
| __shared__ index_t stride_share[REVERSE_MAX_DIM]; |
| __shared__ index_t trailing_share[REVERSE_MAX_DIM]; |
| if (threadIdx.x < REVERSE_MAX_DIM) { |
| stride_share[threadIdx.x] = stride_[threadIdx.x]; |
| trailing_share[threadIdx.x] = trailing_[threadIdx.x]; |
| } |
| __syncthreads(); |
| index_t new_idx = ReverseIndex(index, nreversedim, stride_share, trailing_share); |
| dst[new_idx] = src[index]; |
| } |
| #else |
| template<typename DType> |
| MSHADOW_XINLINE static void Map(index_t index, index_t nreversedim, const DType *src, DType *dst, |
| const index_t * stride_, |
| const index_t * trailing_) { |
| index_t new_idx = ReverseIndex(index, nreversedim, stride_, trailing_); |
| dst[new_idx] = src[index]; |
| } |
| #endif |
| }; |
| |
| |
| template<typename xpu> |
| void ReverseOpForward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| using namespace mshadow; |
| using namespace mxnet_op; |
| const ReverseParam& param = nnvm::get<ReverseParam>(attrs.parsed); |
| CHECK_EQ(inputs[0].type_flag_, outputs[0].type_flag_); |
| CHECK_LT(param.axis.ndim(), REVERSE_MAX_DIM); |
| Stream<xpu> *s = ctx.get_stream<xpu>(); |
| |
| const mxnet::TShape& ishape = inputs[0].shape_; |
| |
| std::vector<index_t> stride_(param.axis.ndim()); |
| std::vector<index_t> trailing_(param.axis.ndim()); |
| index_t reverse_index = 0; |
| for (int axis : param.axis) { |
| CHECK_LT(axis, static_cast<int>(ishape.ndim())); |
| stride_[reverse_index] = ishape[axis]; |
| trailing_[reverse_index] = 1; |
| for (index_t i2 = axis + 1; i2 < ishape.ndim(); ++i2) { |
| trailing_[reverse_index] *= ishape[i2]; |
| } |
| reverse_index++; |
| } |
| |
| #ifdef __CUDACC__ |
| mshadow::Tensor<xpu, 1, uint8_t> workspace = |
| ctx.requested[0].get_space_typed<xpu, 1, uint8_t>( |
| mshadow::Shape1(reverse_index * sizeof(index_t) * 2), s); |
| |
| auto stride_workspace = workspace.dptr_; |
| auto trailing_workspace = workspace.dptr_ + reverse_index * sizeof(index_t); |
| |
| cudaMemcpyAsync(stride_workspace, thrust::raw_pointer_cast(stride_.data()), |
| stride_.size() * sizeof(index_t), |
| cudaMemcpyHostToDevice, mshadow::Stream<gpu>::GetStream(s)); |
| cudaMemcpyAsync(trailing_workspace, thrust::raw_pointer_cast(trailing_.data()), |
| trailing_.size() * sizeof(index_t), |
| cudaMemcpyHostToDevice, mshadow::Stream<gpu>::GetStream(s)); |
| |
| #endif |
| |
| #ifdef __CUDACC__ |
| MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { |
| Kernel<reverse, xpu>::Launch(s, inputs[0].Size(), reverse_index, |
| inputs[0].dptr<DType>(), outputs[0].dptr<DType>(), |
| reinterpret_cast<index_t*>(stride_workspace), reinterpret_cast<index_t*>(trailing_workspace)); |
| }); |
| #else |
| MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { |
| Kernel<reverse, xpu>::Launch(s, inputs[0].Size(), reverse_index, |
| inputs[0].dptr<DType>(), outputs[0].dptr<DType>(), |
| stride_.data(), trailing_.data()); |
| }); |
| #endif |
| } |
| |
| |
| struct StackParam : public dmlc::Parameter<StackParam> { |
| int axis; |
| int num_args; |
| DMLC_DECLARE_PARAMETER(StackParam) { |
| DMLC_DECLARE_FIELD(axis) |
| .set_default(0) |
| .describe("The axis in the result array along which the input arrays are stacked."); |
| DMLC_DECLARE_FIELD(num_args).set_lower_bound(1) |
| .describe("Number of inputs to be stacked."); |
| } |
| }; |
| |
| |
| inline bool StackOpShape(const nnvm::NodeAttrs& attrs, |
| mxnet::ShapeVector *in_attrs, |
| mxnet::ShapeVector *out_attrs) { |
| const StackParam& param = dmlc::get<StackParam>(attrs.parsed); |
| |
| mxnet::TShape dshape; |
| for (const mxnet::TShape& i : (*in_attrs)) { |
| shape_assign(&dshape, i); |
| } |
| if (dshape.ndim() == 0) return false; |
| |
| mxnet::TShape oshape(dshape.ndim() + 1); |
| int axis = CheckAxis(param.axis, oshape.ndim()); |
| for (int i = 0; i < axis; ++i) { |
| oshape[i] = dshape[i]; |
| } |
| oshape[axis] = param.num_args; |
| for (index_t i = axis + 1; i < oshape.ndim(); ++i) { |
| oshape[i] = dshape[i-1]; |
| } |
| SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); |
| |
| return true; |
| } |
| |
| |
| template<typename xpu> |
| void StackOpForward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| using namespace mshadow; |
| using namespace mshadow::expr; |
| const StackParam& param = dmlc::get<StackParam>(attrs.parsed); |
| int axis = CheckAxis(param.axis, outputs[0].ndim()); |
| |
| Stream<xpu> *s = ctx.get_stream<xpu>(); |
| MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { |
| std::vector<Tensor<xpu, 3, DType> > data(inputs.size()); |
| Tensor<xpu, 3, DType> out; |
| size_t leading = 1, trailing = 1; |
| for (int i = 0; i < axis; ++i) { |
| leading *= outputs[0].shape_[i]; |
| } |
| for (int i = axis + 1; i < outputs[0].ndim(); ++i) { |
| trailing *= outputs[0].shape_[i]; |
| } |
| size_t mid = outputs[0].shape_[axis]; |
| Shape<3> oshape = Shape3(leading, mid, trailing); |
| out = outputs[0].get_with_shape<xpu, 3, DType>(oshape, s); |
| |
| for (size_t i = 0; i < inputs.size(); ++i) { |
| Shape<3> dshape = Shape3(leading, 1, trailing); |
| data[i] = inputs[i].get_with_shape<xpu, 3, DType>(dshape, s); |
| } |
| Concatenate(data, &out, 1, req[0]); |
| }) |
| } |
| |
| template<typename xpu> |
| void StackOpBackward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| using namespace mshadow; |
| using namespace mshadow::expr; |
| const StackParam& param = dmlc::get<StackParam>(attrs.parsed); |
| int axis = CheckAxis(param.axis, inputs[0].ndim()); |
| |
| Stream<xpu> *s = ctx.get_stream<xpu>(); |
| MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { |
| std::vector<Tensor<xpu, 3, DType> > grad_in(outputs.size()); |
| Tensor<xpu, 3, DType> grad; |
| size_t leading = 1, trailing = 1; |
| for (int i = 0; i < axis; ++i) { |
| leading *= inputs[0].shape_[i]; |
| } |
| for (int i = axis + 1; i < inputs[0].ndim(); ++i) { |
| trailing *= inputs[0].shape_[i]; |
| } |
| size_t mid = inputs[0].shape_[axis]; |
| Shape<3> oshape = Shape3(leading, mid, trailing); |
| grad = inputs[0].get_with_shape<xpu, 3, DType>(oshape, s); |
| |
| for (size_t i = 0; i < outputs.size(); ++i) { |
| Shape<3> dshape = Shape3(leading, 1, trailing); |
| grad_in[i] = outputs[i].get_with_shape<xpu, 3, DType>(dshape, s); |
| } |
| Split(grad, &grad_in, 1, req); |
| }) |
| } |
| |
| struct SqueezeParam : public dmlc::Parameter<SqueezeParam> { |
| dmlc::optional<mxnet::TShape> axis; |
| DMLC_DECLARE_PARAMETER(SqueezeParam) { |
| DMLC_DECLARE_FIELD(axis) |
| .set_default(dmlc::optional<mxnet::TShape>()) |
| .describe("Selects a subset of the single-dimensional entries in the shape." |
| " If an axis is selected with shape entry greater than one, an error is raised."); |
| } |
| }; |
| |
// Given a shape that may have dim sizes equal to 0,
// move all the zeros to the end of the shape array
// and keep the relative order of the non-zero values.
// Returns the new shape size after moving all zeros to the end.
| inline size_t SqueezeShapeHelper(mxnet::TShape* shape) { |
| CHECK(shape != nullptr); |
| size_t count = 0; |
| for (size_t i = 0; i < shape->ndim(); ++i) { |
| if ((*shape)[i] == 0) { |
| ++count; |
| } else { |
| std::swap((*shape)[i], (*shape)[i-count]); |
| } |
| } |
| return shape->ndim() - count; |
| } |
| |
| inline bool SqueezeShape(const nnvm::NodeAttrs& attrs, |
| mxnet::ShapeVector *in_attrs, |
| mxnet::ShapeVector *out_attrs) { |
| const SqueezeParam& param = nnvm::get<SqueezeParam>(attrs.parsed); |
| CHECK_EQ(in_attrs->size(), 1U) << "Input: [data]"; |
| CHECK_EQ(out_attrs->size(), 1U); |
| const mxnet::TShape& dshape = in_attrs->at(0); |
| const int dndim = dshape.ndim(); |
| if (shape_is_none(dshape)) return false; |
| mxnet::TShape oshape = dshape; |
| if (param.axis.has_value()) { |
| // preprocess axis |
| mxnet::TShape axes = param.axis.value(); |
| for (size_t i = 0; i < axes.ndim(); ++i) { |
| if (axes[i] < 0) { |
| axes[i] += dndim; |
| CHECK_GE(axes[i], 0) |
| << "axis " << axes[i] - dndim << " is out of bounds for array of dimension " << dndim; |
| } |
| CHECK_LT(axes[i], dndim) |
| << "axis " << axes[i] << " is out of bounds for array of dimension " << dndim; |
| CHECK_EQ(dshape[axes[i]], 1) |
| << "cannot select an axis to squeeze out which has size=" |
| << dshape[axes[i]] << " not equal to one"; |
| CHECK_NE(oshape[axes[i]], 0) << "duplicate value in axis"; |
| oshape[axes[i]] = 0; |
| } |
| } else { |
| for (size_t i = 0; i < oshape.ndim(); ++i) { |
| if (oshape[i] == 1) oshape[i] = 0; |
| } |
| } |
| size_t oshape_size = SqueezeShapeHelper(&oshape); |
| if (oshape_size == 0) { // corner case when dshape is (1, 1, 1, 1) |
| oshape[0] = 1; |
| oshape_size = 1; |
| } |
| SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(oshape.data(), oshape.data()+oshape_size)); |
| return true; |
| } |
| |
| struct DepthToSpaceParam : public dmlc::Parameter<DepthToSpaceParam> { |
| int block_size; |
| DMLC_DECLARE_PARAMETER(DepthToSpaceParam) { |
| DMLC_DECLARE_FIELD(block_size) |
| .describe("Blocks of [block_size. block_size] are moved"); |
| } |
| }; |
| |
| inline bool DepthToSpaceOpShape(const nnvm::NodeAttrs& attrs, |
| mxnet::ShapeVector* in_attrs, |
| mxnet::ShapeVector* out_attrs) { |
| const DepthToSpaceParam& param = nnvm::get<DepthToSpaceParam>(attrs.parsed); |
| CHECK_EQ(in_attrs->size(), 1U); |
| CHECK_EQ(out_attrs->size(), 1U); |
  CHECK_EQ(in_attrs->at(0).ndim(), 4) << "Operation Depth To Space requires exactly a 4D tensor";
| |
| mxnet::TShape expected_out(4); |
| |
| mxnet::TShape& in_shape = in_attrs->at(0); |
| int block = param.block_size; |
  CHECK_GT(block, 0) << "block_size must be a positive integer value";
| CHECK_NE(in_shape[1], 0) << "Depth dimension:1 cannot be 0"; |
| CHECK_EQ(in_shape[1] % (block * block), 0) |
| << "Cannot perform Depth To Space operation on the specified tensor." |
| " Dimension:1(depth dimension) should be a multiple of 'block^2'"; |
| CHECK_NE(in_shape[0], 0) |
| << "Operation requires a 4D tensor. Size of dimension:0 cannot be 0"; |
| CHECK_NE(in_shape[2], 0) |
| << "Operation requires a 4D tensor. Size of dimension:2 cannot be 0"; |
| CHECK_NE(in_shape[3], 0) |
| << "Operation requires a 4D tensor. Size of dimension:3 cannot be 0"; |
| |
| expected_out[0] = in_shape[0]; |
| expected_out[1] = in_shape[1] / (block * block); |
| size_t i = 2; |
| while (i < expected_out.ndim()) { |
| expected_out[i] = in_shape[i] * block; |
| ++i; |
| } |
| |
| SHAPE_ASSIGN_CHECK(*out_attrs, 0, expected_out); |
| return true; |
| } |
| |
| inline bool DepthToSpaceOpType(const nnvm::NodeAttrs& attrs, |
| std::vector<int>* in_attrs, |
| std::vector<int>* out_attrs) { |
| CHECK_EQ(in_attrs->size(), 1U); |
| CHECK_EQ(out_attrs->size(), 1U); |
| |
| TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); |
| TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); |
| return out_attrs->at(0) != -1; |
| } |
| |
| /*! |
 * \brief This function updates the input-tensor index from which the data element
 * is fetched before being written to the ith location in the output tensor
| * \param index_position index within offset array to get offset of given dimension |
| * \param dim_size size of current dimension |
| * \param idx output tensor index |
| * \param inp_index index within input tensor from where value is retrieved |
| * \param offset_arr array containing the linear offset of input tensor |
| */ |
| MSHADOW_XINLINE void update_index(index_t index_position, index_t dim_size, index_t *idx, |
| index_t *inp_index, const index_t* offset_arr) { |
| index_t next_idx_val = *idx / dim_size; |
| *inp_index += (*idx - next_idx_val * dim_size) * offset_arr[index_position]; |
| *idx = next_idx_val; |
| } |
| |
| /*! |
| * \brief This function performs the tensor transpose (0, 1, 2, 3, 4, 5) -> |
| * (0, 3, 4, 1, 5, 2) by computing linear index within input tensor to be mapped |
| * to the ith index of output tensor |
| * \param i tensor index |
| * \param out_data output tensor |
| * \param in_data input tensor |
| * \param block size of chunks to be moved out of depth dimension |
| * \param size array containing the size of each dimension of input tensor |
| * \param offset_arr array containing the linear offset of input tensor |
| */ |
| template<int req> |
| struct depth_to_space_forward { |
| template<typename DType> |
| MSHADOW_XINLINE static void Map(index_t i, DType* out_data, const DType* in_data, |
| const int block, const index_t* size, const index_t* offset_arr) { |
| index_t inp_index = 0, idx = i, dim_size; |
| dim_size = block; |
| update_index(2, dim_size, &idx, &inp_index, offset_arr); |
| dim_size = size[3]; |
| update_index(5, dim_size, &idx, &inp_index, offset_arr); |
| dim_size = block; |
| update_index(1, dim_size, &idx, &inp_index, offset_arr); |
| dim_size = size[2]; |
| update_index(4, dim_size, &idx, &inp_index, offset_arr); |
| dim_size = size[1] / (block * block); |
| update_index(3, dim_size, &idx, &inp_index, offset_arr); |
| dim_size = size[0]; |
| update_index(0, dim_size, &idx, &inp_index, offset_arr); |
| KERNEL_ASSIGN(out_data[i], req, in_data[inp_index]); |
| } |
| }; |
| |
| /*! |
| * \brief This function calculates the linear offset for each dimension of |
| * input tensor and stores them in an array, which is later used in |
| * performing depth_to_space operation |
| * \param i global thread id |
| * \param offset_arr array to be populated with offset values |
| * \param size array to be populated with size of each dimension of input tensor |
| * \param block size of chunks to be moved out of depth dimension |
| * \param size0 size of Dim 0 of input tensor |
| * \param size1 size of Dim 1 of input tensor |
| * \param size2 size of Dim 2 of input tensor |
| * \param size3 size of Dim 3 of input tensor |
| */ |
| template<int req> |
| struct compute_offset_for_depth_to_space { |
| template<typename DType> |
| MSHADOW_XINLINE static void Map(index_t i, DType* offset_arr, DType* size, const int block, |
| const index_t size0, const index_t size1, const index_t size2, |
| const index_t size3) { |
| size[0] = size0; |
| size[1] = size1; |
| size[2] = size2; |
| size[3] = size3; |
| |
| offset_arr[5] = 1; |
| offset_arr[4] = offset_arr[5] * size[3]; |
| offset_arr[3] = offset_arr[4] * size[2]; |
| offset_arr[2] = offset_arr[3] * size[1] / (block * block); |
| offset_arr[1] = offset_arr[2] * block; |
| offset_arr[0] = offset_arr[1] * block; |
| } |
| }; |
| |
| template<typename xpu> |
| void DepthToSpaceOpForward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| CHECK_EQ(inputs.size(), 1U); |
| CHECK_EQ(outputs.size(), 1U); |
| CHECK_EQ(req.size(), 1U); |
| mshadow::Stream<xpu> *s = ctx.get_stream<xpu>(); |
| const TBlob& in_data = inputs[0]; |
| const TBlob& out_data = outputs[0]; |
| const DepthToSpaceParam& param = nnvm::get<DepthToSpaceParam>(attrs.parsed); |
| using namespace mxnet_op; |
| int block = param.block_size; |
| |
| mshadow::Tensor<xpu, 1, char> workspace = |
| ctx.requested[0].get_space_typed<xpu, 1, char>(mshadow::Shape1(sizeof(index_t) * 10), s); |
| char* workspace_curr_ptr = workspace.dptr_; |
| index_t* offset_arr = reinterpret_cast<index_t*>(workspace_curr_ptr); |
| index_t* size = reinterpret_cast<index_t*>(workspace_curr_ptr + sizeof(index_t) * 6); |
| |
| MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { |
| MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { |
| Kernel<compute_offset_for_depth_to_space<req_type>, xpu>::Launch( |
| s, 1, offset_arr, size, block, in_data.shape_[0], in_data.shape_[1], |
| in_data.shape_[2], in_data.shape_[3]); |
| |
| Kernel<depth_to_space_forward<req_type>, xpu>::Launch( |
| s, out_data.Size(), out_data.dptr<DType>(), in_data.dptr<DType>(), |
| block, size, offset_arr); |
| }); |
| }); |
| } |
| |
| inline bool SpaceToDepthOpShape(const nnvm::NodeAttrs& attrs, |
| mxnet::ShapeVector* in_attrs, |
| mxnet::ShapeVector* out_attrs) { |
| const DepthToSpaceParam& param = nnvm::get<DepthToSpaceParam>(attrs.parsed); |
| CHECK_EQ(in_attrs->size(), 1U); |
| CHECK_EQ(out_attrs->size(), 1U); |
  CHECK_EQ(in_attrs->at(0).ndim(), 4) << "Operation Space To Depth requires exactly a 4D tensor";
| |
| mxnet::TShape expected_out(in_attrs->at(0).ndim()); |
| |
| mxnet::TShape& in_shape = in_attrs->at(0); |
| int block = param.block_size; |
  CHECK_GT(block, 0) << "block_size must be a positive integer value";
| CHECK_NE(in_shape[0], 0) |
| << "Operation requires a 4D tensor. Size of dimension:0 cannot be 0"; |
| CHECK_NE(in_shape[1], 0) << "Depth dimension:1 cannot be 0"; |
| CHECK_NE(in_shape[2], 0) |
| << "Operation requires a 4D tensor. Size of dimension:2 cannot be 0"; |
  CHECK_EQ(in_shape[2] % block, 0)
    << "Cannot perform Space To Depth operation on the specified tensor."
       " Dimension:2(1st Space dimension) should be a multiple of 'block' ";
| CHECK_NE(in_shape[3], 0) |
| << "Operation requires a 4D tensor. Size of dimension:3 cannot be 0"; |
  CHECK_EQ(in_shape[3] % block, 0)
    << "Cannot perform Space To Depth operation on the specified tensor."
       " Dimension:3(2nd space dimension) should be a multiple of 'block' ";
| |
| expected_out[0] = in_shape[0]; |
| expected_out[1] = in_shape[1] * block * block; |
| uint32_t i = 2; |
| while (i < expected_out.ndim()) { |
| expected_out[i] = in_shape[i] / block; |
| ++i; |
| } |
| |
| SHAPE_ASSIGN_CHECK(*out_attrs, 0, expected_out); |
| return true; |
| } |
| |
| inline bool SpaceToDepthOpType(const nnvm::NodeAttrs& attrs, |
| std::vector<int>* in_attrs, |
| std::vector<int>* out_attrs) { |
| CHECK_EQ(in_attrs->size(), 1U); |
| CHECK_EQ(out_attrs->size(), 1U); |
| |
| TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); |
| TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); |
| return out_attrs->at(0) != -1; |
| } |
| |
| /*! |
 * \brief This function performs the tensor transpose (0, 1, 2, 3, 4, 5) ->
| * (0, 3, 5, 1, 2, 4) by computing linear index within input tensor to be mapped |
| * to the ith index of output tensor |
| * \param i tensor index |
| * \param out_data output tensor |
| * \param in_data input tensor |
| * \param block size of chunks to be moved out of depth dimension |
| * \param size array containing the size of each dimension of input tensor |
| * \param offset_arr array containing the linear offset of input tensor |
| */ |
| template<int req> |
| struct space_to_depth_forward { |
| template<typename DType> |
| MSHADOW_XINLINE static void Map(index_t i, DType* out_data, const DType* in_data, const int block, |
| const index_t* size, const index_t* offset_arr) { |
| index_t inp_index = 0, idx = i, dim_size; |
| dim_size = size[3] / block; |
| update_index(4, dim_size, &idx, &inp_index, offset_arr); |
| dim_size = size[2] / block; |
| update_index(2, dim_size, &idx, &inp_index, offset_arr); |
| dim_size = size[1]; |
| update_index(1, dim_size, &idx, &inp_index, offset_arr); |
| dim_size = block; |
| update_index(5, dim_size, &idx, &inp_index, offset_arr); |
| dim_size = block; |
| update_index(3, dim_size, &idx, &inp_index, offset_arr); |
| dim_size = size[0]; |
| update_index(0, dim_size, &idx, &inp_index, offset_arr); |
| KERNEL_ASSIGN(out_data[i], req, in_data[inp_index]); |
| } |
| }; |
| |
| /*! |
| * \brief This function calculates the linear offset for each dimension of |
| * input tensor and stores them in an array, which is later used in |
| * performing space_to_depth operation |
| * \param i global thread id |
| * \param offset_arr array to be populated with offset values |
| * \param size array to be populated with size of each dimension of input tensor |
| * \param block size of chunks to be moved out of depth dimension |
| * \param size0 size of Dim 0 of input tensor |
| * \param size1 size of Dim 1 of input tensor |
| * \param size2 size of Dim 2 of input tensor |
| * \param size3 size of Dim 3 of input tensor |
| */ |
| template<int req> |
| struct compute_offset_for_space_to_depth { |
| template<typename DType> |
| MSHADOW_XINLINE static void Map(index_t i, DType* offset_arr, DType* size, const int block, |
| const index_t size0, const index_t size1, |
| const index_t size2, const index_t size3) { |
| size[0] = size0; |
| size[1] = size1; |
| size[2] = size2; |
| size[3] = size3; |
| |
| offset_arr[5] = 1; |
| offset_arr[4] = offset_arr[5] * block; |
| offset_arr[3] = offset_arr[4] * size[3] / block; |
| offset_arr[2] = offset_arr[3] * block; |
| offset_arr[1] = offset_arr[2] * size[2] / block; |
| offset_arr[0] = offset_arr[1] * size[1]; |
| } |
| }; |
| |
| template<typename xpu> |
| void SpaceToDepthOpForward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| CHECK_EQ(inputs.size(), 1U); |
| CHECK_EQ(outputs.size(), 1U); |
| CHECK_EQ(req.size(), 1U); |
| mshadow::Stream<xpu> *s = ctx.get_stream<xpu>(); |
| const TBlob& in_data = inputs[0]; |
| const TBlob& out_data = outputs[0]; |
| const DepthToSpaceParam& param = nnvm::get<DepthToSpaceParam>(attrs.parsed); |
| using namespace mxnet_op; |
| int block = param.block_size; |
| |
| mshadow::Tensor<xpu, 1, char> workspace = |
| ctx.requested[0].get_space_typed<xpu, 1, char>(mshadow::Shape1(sizeof(index_t) * 10), s); |
| char* workspace_curr_ptr = workspace.dptr_; |
| index_t* offset_arr = reinterpret_cast<index_t*>(workspace_curr_ptr); |
| index_t* size = reinterpret_cast<index_t*>(workspace_curr_ptr + sizeof(index_t) * 6); |
| |
| MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { |
| MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { |
| Kernel<compute_offset_for_space_to_depth<req_type>, xpu>::Launch( |
| s, 1, offset_arr, size, block, in_data.shape_[0], in_data.shape_[1], |
| in_data.shape_[2], in_data.shape_[3]); |
| Kernel<space_to_depth_forward<req_type>, xpu>::Launch( |
| s, out_data.Size(), out_data.dptr<DType>(), in_data.dptr<DType>(), |
| block, size, offset_arr); |
| }); |
| }); |
| } |
| |
| namespace split_enum { |
| enum SplitOpInputs {kData}; |
| } // namespace split_enum |
| |
| struct SplitParam : public dmlc::Parameter<SplitParam> { |
| mxnet::TShape indices; |
| int axis; |
| bool squeeze_axis; |
| int sections; |
| DMLC_DECLARE_PARAMETER(SplitParam) { |
| DMLC_DECLARE_FIELD(indices) |
| .describe("Indices of splits. The elements should denote the boundaries of at which split" |
| " is performed along the `axis`."); |
| DMLC_DECLARE_FIELD(axis).set_default(1) |
| .describe("Axis along which to split."); |
| DMLC_DECLARE_FIELD(squeeze_axis).set_default(0) |
| .describe("If true, Removes the axis with length 1 from the shapes of the output arrays." |
| " **Note** that setting `squeeze_axis` to ``true`` removes axis with length 1" |
| " only along the `axis` which it is split." |
| " Also `squeeze_axis` can be set to ``true``" |
| " only if ``input.shape[axis] == num_outputs``."); |
| DMLC_DECLARE_FIELD(sections).set_default(0) |
| .describe("Number of sections if equally splitted. Default to 0 which means split by indices."); |
| } |
| }; // struct SplitParam |
| |
| inline mxnet::TShape GetSplitIndices(const mxnet::TShape& ishape, int axis, int sections) { |
| mxnet::TShape indices(sections+1); |
| indices[0] = 0; |
| int64_t section_size = ishape[axis] / sections; |
| for (int i = 0; i < sections; ++i) { |
| indices[i+1] = section_size * (i + 1); |
| } |
| return indices; |
| } |
| |
| inline bool SplitOpType(const nnvm::NodeAttrs& attrs, |
| std::vector<int>* in_attrs, |
| std::vector<int>* out_attrs) { |
| CHECK_EQ(in_attrs->size(), 1U); |
| int dtype = (*in_attrs)[0]; |
| CHECK_NE(dtype, -1) << "First input must have specified type"; |
| const SplitParam& param = nnvm::get<SplitParam>(attrs.parsed); |
| out_attrs->clear(); |
| int num_outputs = (param.sections > 0) ? param.sections : param.indices.ndim(); |
| for (int i = 0; i < num_outputs; ++i) { |
| out_attrs->push_back(dtype); |
| } |
| return true; |
| } |
| |
| inline bool SplitOpShape(const nnvm::NodeAttrs& attrs, |
| mxnet::ShapeVector* in_attrs, |
| mxnet::ShapeVector* out_attrs) { |
| using namespace mshadow; |
| const SplitParam& param = nnvm::get<SplitParam>(attrs.parsed); |
| CHECK_EQ(in_attrs->size(), 1U); |
| mxnet::TShape dshape = in_attrs->at(split_enum::kData); |
| mxnet::TShape ishape = in_attrs->at(split_enum::kData); |
| if (dshape.ndim() == 0) return false; |
| if (param.axis >= 0) { |
| CHECK_LT(static_cast<size_t>(param.axis), dshape.ndim()); |
| } else { |
    CHECK_GE(param.axis + static_cast<int>(dshape.ndim()), 0);
| } |
| int real_axis = param.axis; |
| if (real_axis < 0) { |
| real_axis += dshape.ndim(); |
| } |
| const mxnet::TShape indices = |
| (param.sections > 0) ? GetSplitIndices(ishape, real_axis, param.sections) : param.indices; |
| int num_outputs = (param.sections > 0) ? indices.ndim() - 1 : indices.ndim(); |
| // Pre-compute squeezed output shape for future usage |
| mxnet::TShape squeezed_dshape = dshape; |
| for (int d = real_axis; d < static_cast<int>(squeezed_dshape.ndim()) - 1; ++d) { |
| squeezed_dshape[d] = squeezed_dshape[d+1]; |
| } |
| squeezed_dshape = mxnet::TShape(&squeezed_dshape[0], &squeezed_dshape[squeezed_dshape.ndim()-1]); |
| // Assign shape to every output |
| for (int i = 0; i < num_outputs; ++i) { |
| int start = indices[i]; |
| int end = (i < num_outputs - 1) ? indices[i + 1] : ishape[real_axis]; |
    CHECK(start < end)
      << "start " << start << " is not less than end " << end << " for subarray " << i;
    CHECK(end <= ishape[real_axis])
      << "end " << end << " exceeds the size of the axis " << ishape[real_axis];
| dshape[real_axis] = (end - start); |
| if (param.squeeze_axis) { |
| CHECK_EQ(end - start, 1U) << "expected axis size of 1 but got " << end - start; |
| SHAPE_ASSIGN_CHECK(*out_attrs, i, squeezed_dshape); |
| } else { |
| SHAPE_ASSIGN_CHECK(*out_attrs, i, dshape); |
| } |
| } |
| mxnet::TShape back_calculate_dshape = ishape; |
| back_calculate_dshape[real_axis] = 0; |
| for (int d = 0; d < real_axis; ++d) { |
| back_calculate_dshape[d] = (*out_attrs)[0][d]; |
| } |
| if (param.squeeze_axis) { |
| back_calculate_dshape[real_axis] = num_outputs; |
| } else { |
| for (int i = 0; i < num_outputs; ++i) { |
| back_calculate_dshape[real_axis] += (*out_attrs)[i][real_axis]; |
| } |
| } |
| for (int d = real_axis + 1; d < static_cast<int>(ishape.ndim()); ++d) { |
| if (param.squeeze_axis) { |
| back_calculate_dshape[d] = (*out_attrs)[0][d - 1]; |
| } else { |
| back_calculate_dshape[d] = (*out_attrs)[0][d]; |
| } |
| } |
| SHAPE_ASSIGN_CHECK(*in_attrs, split_enum::kData, back_calculate_dshape); |
| return true; |
| } |
| |
| struct SplitKernel { |
| /*! |
| * \brief Map function for forward split_v2 operator |
| * \param i global thread id |
| * \param in_data ptr to input buffer |
| * \param out_data ptr to ptr of outputs buffer |
| * \param indices ptr to indices buffer |
| * \param num_sections # of sections after split |
   * \param axis_size size of the axis to be split on
   * \param trailing_size step size within the data buffer of the axis to be split on
| */ |
| template<typename DType> |
| static MSHADOW_XINLINE void Map(size_t i, |
| const DType *in_data, DType** out_data, const size_t* indices, |
| const size_t num_sections, const size_t axis_size, |
| const size_t trailing_size) { |
| size_t idx = i / trailing_size % axis_size; |
| size_t target = 0; |
| for (size_t section = 0; |
| section < num_sections && indices[section] <= idx; |
| target = section++) {} |
| DType* target_data = out_data[target]; |
| const size_t mid_idx = idx - indices[target]; |
| const size_t head_idx = i / (trailing_size * axis_size); |
| const size_t tail_idx = i % trailing_size; |
| const size_t section_size = indices[target + 1] - indices[target]; |
| const size_t target_idx = |
| head_idx * trailing_size * section_size + mid_idx * trailing_size + tail_idx; |
| target_data[target_idx] = in_data[i]; |
| } |
| }; |
| |
| struct ConcatenateKernel { |
| /*! |
| * \brief Map function for backward split_v2 operator |
| * \param i global thread id |
| * \param out_grad ptr to ptr of out grads buffer |
| * \param in_grad ptr to input grad buffer |
| * \param indices ptr to indices buffer |
| * \param num_sections # of sections after split |
   * \param axis_size size of the axis to be split on
   * \param trailing_size step size within the data buffer of the axis to be split on
| */ |
| template<typename DType> |
| static MSHADOW_XINLINE void Map(size_t i, |
| DType** out_grad, DType* in_grad, const size_t* indices, |
| const size_t num_sections, const size_t axis_size, |
| const size_t trailing_size) { |
| size_t idx = i / trailing_size % axis_size; |
| size_t src = 0; |
| for (size_t section = 0; |
| section < num_sections && indices[section] <= idx; |
| src = section++) {} |
| DType* src_grad = out_grad[src]; |
| const size_t mid_idx = idx - indices[src]; |
| const size_t head_idx = i / (trailing_size * axis_size); |
| const size_t tail_idx = i % trailing_size; |
| const size_t section_size = indices[src + 1] - indices[src]; |
| const size_t src_idx = |
| head_idx * trailing_size * section_size + mid_idx * trailing_size + tail_idx; |
| in_grad[i] = src_grad[src_idx]; |
| } |
| }; |
| |
| template<typename xpu> |
| inline void SplitOpForward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| using namespace mshadow; |
| using namespace mshadow::expr; |
| using namespace mxnet_op; |
| const SplitParam& param = nnvm::get<SplitParam>(attrs.parsed); |
| CHECK_EQ(inputs.size(), 1U); |
| CHECK_EQ(outputs.size(), (param.sections > 0) ? param.sections : param.indices.ndim()); |
| Stream<xpu> *s = ctx.get_stream<xpu>(); |
| const TBlob& input_data = inputs[split_enum::kData]; |
| size_t leading = 1, trailing = 1; |
| int real_axis = param.axis; |
| if (real_axis < 0) { |
| real_axis += input_data.ndim(); |
| } |
| CHECK_LT(real_axis, input_data.ndim()); |
| size_t mid = input_data.shape_[real_axis]; |
| for (int i = 0; i < real_axis; ++i) { |
| leading *= input_data.shape_[i]; |
| } |
| for (int i = real_axis + 1; i < input_data.ndim(); ++i) { |
| trailing *= input_data.shape_[i]; |
| } |
| |
| size_t workspace_size = 0; |
| const mxnet::TShape& ishape = input_data.shape_; |
| const mxnet::TShape split_pts = |
| (param.sections > 0) ? GetSplitIndices(ishape, real_axis, param.sections) : param.indices; |
| std::vector<size_t> indices; |
| for (const auto& section : split_pts) { |
| indices.push_back(section); |
| } |
| if (param.sections == 0) { |
| indices.push_back(ishape[real_axis]); |
| } |
| workspace_size += indices.size() * sizeof(size_t); |
| MSHADOW_TYPE_SWITCH(input_data.type_flag_, DType, { |
| std::vector<DType*> output_data; |
| for (const TBlob& data : outputs) { |
| output_data.push_back(data.dptr<DType>()); |
| } |
| workspace_size += output_data.size() * sizeof(DType*); |
| Tensor<xpu, 1, char> workspace = |
| ctx.requested[0].get_space_typed<xpu, 1, char>(Shape1(workspace_size), s); |
| Tensor<cpu, 1, size_t> indices_cpu_tensor(indices.data(), Shape1(indices.size())); |
| Tensor<xpu, 1, size_t> indices_xpu_tensor( |
| reinterpret_cast<size_t*>(workspace.dptr_), Shape1(indices.size())); |
| Tensor<cpu, 1, DType*> ptrs_cpu_tensor(output_data.data(), Shape1(output_data.size())); |
| Tensor<xpu, 1, DType*> ptrs_xpu_tensor( |
| reinterpret_cast<DType**>(workspace.dptr_ + indices.size() * sizeof(size_t)), |
| Shape1(output_data.size())); |
| mshadow::Copy(indices_xpu_tensor, indices_cpu_tensor, s); |
| mshadow::Copy(ptrs_xpu_tensor, ptrs_cpu_tensor, s); |
| Kernel<SplitKernel, xpu>::Launch( |
| s, input_data.Size(), input_data.dptr<DType>(), ptrs_xpu_tensor.dptr_, |
| indices_xpu_tensor.dptr_, indices.size() - 1, mid, trailing); |
| }); |
| } |
| |
| template<typename xpu> |
| inline void SplitOpBackward(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<TBlob>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<TBlob>& outputs) { |
| using namespace mshadow; |
| using namespace mshadow::expr; |
| using namespace mxnet_op; |
| const SplitParam& param = nnvm::get<SplitParam>(attrs.parsed); |
| CHECK_EQ(inputs.size(), (param.sections > 0) ? param.sections : param.indices.ndim()) |
| << "out grad vector size mush match the output size"; |
| CHECK_EQ(outputs.size(), 1U); |
| Stream<xpu> *s = ctx.get_stream<xpu>(); |
| TBlob input_grad = outputs[split_enum::kData]; |
| size_t leading = 1, trailing = 1; |
| int real_axis = param.axis; |
| if (real_axis < 0) { |
| real_axis += input_grad.ndim(); |
| } |
| CHECK_LT(real_axis, input_grad.ndim()); |
| size_t mid = input_grad.shape_[real_axis]; |
| for (int i = 0; i < real_axis; ++i) { |
| leading *= input_grad.shape_[i]; |
| } |
| for (int i = real_axis + 1; i < input_grad.ndim(); ++i) { |
| trailing *= input_grad.shape_[i]; |
| } |
| |
| size_t workspace_size = 0; |
| const mxnet::TShape& ishape = input_grad.shape_; |
| const mxnet::TShape split_pts = |
| (param.sections > 0) ? GetSplitIndices(ishape, real_axis, param.sections) : param.indices; |
| std::vector<size_t> indices; |
| for (const auto& section : split_pts) { |
| indices.push_back(section); |
| } |
| if (param.sections == 0) { |
| indices.push_back(ishape[real_axis]); |
| } |
| workspace_size += indices.size() * sizeof(size_t); |
| MSHADOW_TYPE_SWITCH(input_grad.type_flag_, DType, { |
| std::vector<DType*> out_grads; |
| for (const TBlob& output_grad : inputs) { |
| out_grads.push_back(output_grad.dptr<DType>()); |
| } |
| workspace_size += out_grads.size() * sizeof(DType*); |
| Tensor<xpu, 1, char> workspace = |
| ctx.requested[0].get_space_typed<xpu, 1, char>(Shape1(workspace_size), s); |
| Tensor<cpu, 1, size_t> indices_cpu_tensor(indices.data(), Shape1(indices.size())); |
| Tensor<xpu, 1, size_t> indices_xpu_tensor( |
| reinterpret_cast<size_t*>(workspace.dptr_), Shape1(indices.size())); |
| Tensor<cpu, 1, DType*> ptrs_cpu_tensor(out_grads.data(), Shape1(inputs.size())); |
| Tensor<xpu, 1, DType*> ptrs_xpu_tensor( |
| reinterpret_cast<DType**>(workspace.dptr_ + indices.size() * sizeof(size_t)), |
| Shape1(inputs.size())); |
| mshadow::Copy(indices_xpu_tensor, indices_cpu_tensor, s); |
| mshadow::Copy(ptrs_xpu_tensor, ptrs_cpu_tensor, s); |
| Kernel<ConcatenateKernel, xpu>::Launch( |
| s, input_grad.Size(), ptrs_xpu_tensor.dptr_, input_grad.dptr<DType>(), |
| indices_xpu_tensor.dptr_, indices.size() - 1, mid, trailing); |
| }); |
| } |
| |
| inline uint32_t SplitNumOutputs(const NodeAttrs& attrs) { |
| const SplitParam& param = nnvm::get<SplitParam>(attrs.parsed); |
| return (param.sections > 0) ? param.sections : param.indices.ndim(); |
| } |
| |
| } // namespace op |
| } // namespace mxnet |
| |
| #endif // MXNET_OPERATOR_TENSOR_MATRIX_OP_INL_H_ |