/*!
* Copyright (c) 2015 by Contributors
* \file matrix_op-inl.h
* \brief Function definition of matrix related operators
*/
#ifndef MXNET_OPERATOR_TENSOR_MATRIX_OP_INL_H_
#define MXNET_OPERATOR_TENSOR_MATRIX_OP_INL_H_
#include <mxnet/operator_util.h>
#include <vector>
#include <algorithm>
#include <utility>
#include "../mshadow_op.h"
#include "../elemwise_op_common.h"
#include "../mxnet_op.h"
#include "broadcast_reduce_op.h"
#if MXNET_USE_CUDA
#include <thrust/device_vector.h>
#endif
namespace mxnet {
namespace op {
struct ReshapeParam : public dmlc::Parameter<ReshapeParam> {
TShape target_shape;
bool keep_highest;
nnvm::Tuple<int> shape;
bool reverse;
DMLC_DECLARE_PARAMETER(ReshapeParam) {
int tmp[] = {0, 0};
DMLC_DECLARE_FIELD(target_shape)
.set_default(TShape(tmp, tmp + 2))
.describe("(Deprecated! Use ``shape`` instead.) "
"Target new shape. One and only one dim can be 0, "
"in which case it will be inferred from the rest of dims");
DMLC_DECLARE_FIELD(keep_highest).set_default(false)
.describe("(Deprecated! Use ``shape`` instead.) Whether keep the highest dim unchanged."
"If set to true, then the first dim in target_shape is ignored,"
"and always fixed as input");
DMLC_DECLARE_FIELD(shape)
.set_default(nnvm::Tuple<int>())
.describe("The target shape");
DMLC_DECLARE_FIELD(reverse)
.set_default(false)
.describe("If true then translating the input shape from right to left");
}
};
inline bool ReshapeShape(const nnvm::NodeAttrs& attrs,
std::vector<TShape> *in_attrs,
std::vector<TShape> *out_attrs) {
const ReshapeParam& param_ = nnvm::get<ReshapeParam>(attrs.parsed);
CHECK_EQ(in_attrs->size(), 1U) << "Input: [data]";
CHECK_EQ(out_attrs->size(), 1U);
CHECK_EQ(param_.target_shape.ndim() > 0 ||
param_.shape.ndim() > 0, true) << "target_shape or shape must be present.";
const TShape &dshape = (*in_attrs)[0];
if (dshape.ndim() == 0) return false;
if (param_.shape.ndim() != 0) {
std::vector<int> dshape_vec;
std::vector<int> param_shape_vec(param_.shape.begin(), param_.shape.end());
for (index_t i = 0; i < dshape.ndim(); ++i) {
dshape_vec.push_back(dshape[i]);
}
std::vector<int> tmp;
size_t src_idx = 0;
int inf_idx = -1;
if (param_.reverse) {
std::reverse(dshape_vec.begin(), dshape_vec.end());
std::reverse(param_shape_vec.begin(), param_shape_vec.end());
}
auto dshape_len = dshape_vec.size();
auto params_len = param_shape_vec.size();
for (index_t i = 0; i < params_len; ++i) {
int proposed_dim = param_shape_vec[i];
if (proposed_dim == 0) {
// keep same
CHECK_LT(src_idx, dshape_len);
tmp.push_back(dshape_vec[src_idx++]);
} else if (proposed_dim == -1) {
// infer
CHECK_LT(inf_idx, 0) << "One and only one dim can be inferred";
inf_idx = i;
tmp.push_back(1);
src_idx++;
} else if (proposed_dim == -2) {
// copy all remaining dims from source
while (src_idx < dshape_len) {
size_t dn = dshape_vec[src_idx++];
tmp.push_back(dn);
}
} else if (proposed_dim == -3) {
// merge two dims from source
CHECK_LT(src_idx, dshape_len-1);
size_t d1 = dshape_vec[src_idx++];
size_t d2 = dshape_vec[src_idx++];
size_t dn = d1 * d2;
tmp.push_back(dn);
} else if (proposed_dim == -4) {
// split the source dim s into two dims
// read the left dim and then the right dim (either can be -1)
CHECK_LT(i + 2, params_len);
CHECK_LT(src_idx, dshape_len);
size_t d0 = dshape_vec[src_idx++];
int d1 = param_shape_vec[++i];
int d2 = param_shape_vec[++i];
CHECK(d1 != -1 || d2 != -1) << "Split dims cannot both be -1.";
if (d1 == -1) d1 = d0 / d2;
if (d2 == -1) d2 = d0 / d1;
CHECK_EQ(d1 * d2, static_cast<int>(d0)) <<
"Split dims " << d1 << ", " << d2 << " do not divide original dim " << d0;
tmp.push_back(d1);
tmp.push_back(d2);
} else {
// greater than 0, new shape
tmp.push_back(proposed_dim);
src_idx++;
}
}
if (inf_idx >= 0) {
if (dshape.Size() > 0) {
int new_size = 1;
for (int x : tmp) new_size *= x;
tmp[inf_idx] = dshape.Size() / new_size;
} else {
tmp[inf_idx] = 0;
}
}
if (param_.reverse) {
std::reverse(param_shape_vec.begin(), param_shape_vec.end());
std::reverse(dshape_vec.begin(), dshape_vec.end());
std::reverse(tmp.begin(), tmp.end());
}
TShape oshape(tmp.begin(), tmp.end());
CHECK_EQ(oshape.Size(), dshape.Size())
<< "Target shape size is different to source. "
<< "Target: " << oshape
<< "\nSource: " << dshape;
out_attrs->clear();
out_attrs->push_back(oshape);
} else {
LOG(INFO) << "Using target_shape will be deprecated.";
TShape oshape = param_.target_shape;
int neg_count = 0;
index_t inf_idx = 0;
index_t start_idx = param_.keep_highest ? 1 : 0;
if (param_.keep_highest) {
oshape[0] = dshape[0];
}
for (index_t i = start_idx; i < oshape.ndim(); ++i) {
if (oshape[i] == 0) {
neg_count++;
inf_idx = i;
}
}
if (neg_count == 1) {
oshape[inf_idx] = 1;
oshape[inf_idx] = dshape.Size() / oshape.Size();
}
CHECK(oshape.Size() == dshape.Size())
<< "Target shape size is different to source. "
<< "Target: " << param_.target_shape.Size()
<< "\nSource: " << dshape.Size();
out_attrs->clear();
out_attrs->push_back(oshape);
}
return true;
}
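// Illustrative shape-inference examples for the special codes handled above
// (worked by hand from the logic in ReshapeShape; not an exhaustive list):
//   data (2, 3, 4), shape=(-1, 4)        -> (6, 4)     -1 is inferred from the remaining size
//   data (2, 3, 4), shape=(0, -2)        -> (2, 3, 4)  0 copies a dim, -2 copies all remaining dims
//   data (2, 3, 4), shape=(-3, 4)        -> (6, 4)     -3 merges two consecutive source dims
//   data (2, 12),   shape=(0, -4, 3, -1) -> (2, 3, 4)  -4 splits one source dim into two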
inline bool FlattenShape(const nnvm::NodeAttrs& attrs,
std::vector<TShape> *in_attrs,
std::vector<TShape> *out_attrs) {
CHECK_EQ(in_attrs->size(), 1U) << "Input: [data]";
CHECK_EQ(out_attrs->size(), 1U);
const TShape &dshape = (*in_attrs)[0];
if (dshape.ndim() == 0) return false;
out_attrs->clear();
uint32_t target_dim = 1;
for (uint32_t i = 1; i < dshape.ndim(); ++i) {
target_dim *= dshape[i];
}
out_attrs->push_back(mshadow::Shape2(dshape[0], target_dim));
return true;
}
struct TransposeParam : public dmlc::Parameter<TransposeParam> {
TShape axes;
DMLC_DECLARE_PARAMETER(TransposeParam) {
DMLC_DECLARE_FIELD(axes).set_default(TShape())
.describe("Target axis order. By default the axes will be inverted.");
}
};
template<typename xpu>
void TransposeImpl(RunContext ctx,
const TBlob& src,
const TBlob& ret,
const TShape& axes) {
using namespace mshadow;
using namespace mshadow::expr;
CHECK_EQ(src.type_flag_, ret.type_flag_);
Stream<xpu> *s = ctx.get_stream<xpu>();
MSHADOW_TYPE_SWITCH(ret.type_flag_, DType, {
switch (axes.ndim()) {
case 0:
break;
case 1: {
Tensor<xpu, 1, DType> in = src.get<xpu, 1, DType>(s);
Tensor<xpu, 1, DType> out = ret.get<xpu, 1, DType>(s);
Copy(out, in, s);
break;
}
case 2: {
mshadow::Tensor<xpu, 2, DType> in = src.FlatTo2D<xpu, DType>(s);
mshadow::Tensor<xpu, 2, DType> out = ret.FlatTo2D<xpu, DType>(s);
if (axes[0] == 1 && axes[1] == 0) {
out = in.T();
} else {
Copy(out, in, s);
}
break;
}
case 3: {
Tensor<xpu, 3, DType> in = src.get<xpu, 3, DType>(s);
Tensor<xpu, 3, DType> out = ret.get<xpu, 3, DType>(s);
out = transpose(in, axes.get<3>());
break;
}
case 4: {
Tensor<xpu, 4, DType> in = src.get<xpu, 4, DType>(s);
Tensor<xpu, 4, DType> out = ret.get<xpu, 4, DType>(s);
out = transpose(in, axes.get<4>());
break;
}
case 5: {
Tensor<xpu, 5, DType> in = src.get<xpu, 5, DType>(s);
Tensor<xpu, 5, DType> out = ret.get<xpu, 5, DType>(s);
out = transpose(in, axes.get<5>());
break;
}
default:
LOG(FATAL) << "Transpose support at most 5 dimensions";
break;
}
});
}
// matrix transpose
template<typename xpu>
void Transpose(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
const TransposeParam& param = nnvm::get<TransposeParam>(attrs.parsed);
CHECK_EQ(req[0], kWriteTo) << "Transpose does not support inplace";
if (param.axes.ndim() == 0) {
TShape axes = TShape(inputs[0].ndim());
for (index_t i = 0; i < axes.ndim(); ++i) {
axes[i] = axes.ndim() - 1 - i;
}
TransposeImpl<xpu>(ctx.run_ctx, inputs[0], outputs[0], axes);
} else {
TransposeImpl<xpu>(ctx.run_ctx, inputs[0], outputs[0], param.axes);
}
}
inline bool TransposeShape(const nnvm::NodeAttrs& attrs,
std::vector<TShape> *in_attrs,
std::vector<TShape> *out_attrs) {
const TransposeParam& param = nnvm::get<TransposeParam>(attrs.parsed);
CHECK_EQ(in_attrs->size(), 1U);
CHECK_EQ(out_attrs->size(), 1U);
TShape& shp = (*in_attrs)[0];
CHECK_LE(shp.ndim(), 5U) << "Transpose supports at most 5 dimensions";
TShape ret(shp.ndim());
if (param.axes.ndim() == 0) {
for (index_t i = 0; i < shp.ndim(); ++i) {
ret[i] = shp[shp.ndim()-1-i];
}
} else {
CHECK_EQ(shp.ndim(), param.axes.ndim());
for (index_t i = 0; i < shp.ndim(); ++i) {
CHECK(param.axes[i] < shp.ndim());
ret[i] = shp[param.axes[i]];
}
}
SHAPE_ASSIGN_CHECK(*out_attrs, 0, ret);
return true;
}
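// Examples of the transpose shape rule above (hand-checked, for illustration only):
//   input (2, 3, 4), axes unset     -> (4, 3, 2)   axes default to the reversed order
//   input (2, 3, 4), axes=(1, 2, 0) -> (3, 4, 2)   ret[i] = shp[axes[i]]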
struct ExpandDimParam : public dmlc::Parameter<ExpandDimParam> {
index_t axis;
DMLC_DECLARE_PARAMETER(ExpandDimParam) {
DMLC_DECLARE_FIELD(axis)
.describe("Position (amongst axes) where new axis is to be inserted.");
}
};
inline bool ExpandDimShape(const nnvm::NodeAttrs& attrs,
std::vector<TShape> *in_attrs,
std::vector<TShape> *out_attrs) {
const ExpandDimParam& param = nnvm::get<ExpandDimParam>(attrs.parsed);
CHECK_EQ(in_attrs->size(), 1U);
CHECK_EQ(out_attrs->size(), 1U);
TShape& shp = (*in_attrs)[0];
CHECK_LE(param.axis, shp.ndim())
<< "axis exceeds the dimension of the array";
TShape ret(shp.ndim() + 1);
for (index_t i = 0; i < param.axis; ++i) ret[i] = shp[i];
ret[param.axis] = 1;
for (index_t i = param.axis+1; i < ret.ndim(); ++i) ret[i] = shp[i-1];
SHAPE_ASSIGN_CHECK(*out_attrs, 0, ret);
return true;
}
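// Example (illustrative): input (3, 4) with axis=1 -> output (3, 1, 4);
// axis may also equal ndim, e.g. input (3, 4) with axis=2 -> (3, 4, 1).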
struct DotParam : public dmlc::Parameter<DotParam> {
bool transpose_a;
bool transpose_b;
DMLC_DECLARE_PARAMETER(DotParam) {
DMLC_DECLARE_FIELD(transpose_a)
.describe("If true then transpose the first input before dot.")
.set_default(false);
DMLC_DECLARE_FIELD(transpose_b)
.describe("If true then transpose the second input before dot.")
.set_default(false);
}
};
template<typename xpu>
void DotForward_(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
using namespace mshadow::expr;
const DotParam& param = nnvm::get<DotParam>(attrs.parsed);
Stream<xpu> *s = ctx.get_stream<xpu>();
CHECK_EQ(outputs[0].type_flag_, inputs[0].type_flag_)
<< "Binary function only supports input/output with the same type";
CHECK_EQ(outputs[0].type_flag_, inputs[1].type_flag_)
<< "Binary function only supports input/output with the same type";
CHECK_EQ(outputs[0].type_flag_, kFloat32)
<< "dot only supports 32-bit float so far";
if (inputs[0].ndim() == 1 && inputs[1].ndim() == 1) {
CHECK_NE(req[0], kAddTo) << "AddTo not yet supported";
Tensor<xpu, 1, real_t> out = outputs[0].get<xpu, 1, real_t>(s);
VectorDot(out,
inputs[0].get<xpu, 1, real_t>(s),
inputs[1].get<xpu, 1, real_t>(s));
} else {
int ma, na, mb, nb, m, n;
if (param.transpose_a) {
ma = inputs[0].size(0);
na = inputs[0].Size()/ma;
m = na;
} else {
na = inputs[0].size(inputs[0].ndim()-1);
ma = inputs[0].Size()/na;
m = ma;
}
if (param.transpose_b) {
nb = inputs[1].size(inputs[1].ndim()-1);
mb = inputs[1].Size()/nb;
n = mb;
} else {
mb = inputs[1].size(0);
nb = inputs[1].Size()/mb;
n = nb;
}
Tensor<xpu, 2, real_t> input0 =
inputs[0].get_with_shape<xpu, 2, real_t>(Shape2(ma, na), s);
Tensor<xpu, 2, real_t> input1 =
inputs[1].get_with_shape<xpu, 2, real_t>(Shape2(mb, nb), s);
Tensor<xpu, 2, real_t> out =
outputs[0].get_with_shape<xpu, 2, real_t>(Shape2(m, n), s);
if (param.transpose_a && param.transpose_b) {
ASSIGN_DISPATCH(out, req[0], dot(input0.T(), input1.T()));
} else if (!param.transpose_a && param.transpose_b) {
ASSIGN_DISPATCH(out, req[0], dot(input0, input1.T()));
} else if (param.transpose_a && !param.transpose_b) {
ASSIGN_DISPATCH(out, req[0], dot(input0.T(), input1));
} else {
ASSIGN_DISPATCH(out, req[0], dot(input0, input1));
}
}
}
template<typename xpu>
void DotBackward_(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
using namespace mshadow::expr;
const DotParam& param = nnvm::get<DotParam>(attrs.parsed);
Stream<xpu> *s = ctx.get_stream<xpu>();
CHECK_NE(req[0], kWriteInplace);
CHECK_NE(req[1], kWriteInplace);
if (inputs[1].ndim() == 1 && inputs[2].ndim() == 1) {
Tensor<xpu, 1, real_t> mout_grad = inputs[0].get<xpu, 1, real_t>(s);
Tensor<xpu, 1, real_t> mlhs_data = inputs[1].get<xpu, 1, real_t>(s);
Tensor<xpu, 1, real_t> mrhs_data = inputs[2].get<xpu, 1, real_t>(s);
Tensor<xpu, 1, real_t> mlhs_grad = outputs[0].get<xpu, 1, real_t>(s);
Tensor<xpu, 1, real_t> mrhs_grad = outputs[1].get<xpu, 1, real_t>(s);
ASSIGN_DISPATCH(mrhs_grad, req[1],
broadcast_scalar(mout_grad, mlhs_data.shape_) * mlhs_data);
ASSIGN_DISPATCH(mlhs_grad, req[0],
broadcast_scalar(mout_grad, mlhs_data.shape_) * mrhs_data);
} else {
int ma, na, mb, nb, m, n;
if (param.transpose_a) {
ma = outputs[0].size(0);
na = outputs[0].Size()/ma;
m = na;
} else {
na = outputs[0].size(outputs[0].ndim()-1);
ma = outputs[0].Size()/na;
m = ma;
}
if (param.transpose_b) {
nb = outputs[1].size(outputs[1].ndim()-1);
mb = outputs[1].Size()/nb;
n = mb;
} else {
mb = outputs[1].size(0);
nb = outputs[1].Size()/mb;
n = nb;
}
Tensor<xpu, 2, real_t> mout_grad =
inputs[0].get_with_shape<xpu, 2, real_t>(Shape2(m, n), s);
Tensor<xpu, 2, real_t> mlhs_data =
inputs[1].get_with_shape<xpu, 2, real_t>(Shape2(ma, na), s);
Tensor<xpu, 2, real_t> mrhs_data =
inputs[2].get_with_shape<xpu, 2, real_t>(Shape2(mb, nb), s);
Tensor<xpu, 2, real_t> mlhs_grad =
outputs[0].get_with_shape<xpu, 2, real_t>(Shape2(ma, na), s);
Tensor<xpu, 2, real_t> mrhs_grad =
outputs[1].get_with_shape<xpu, 2, real_t>(Shape2(mb, nb), s);
if (param.transpose_a && param.transpose_b) {
// Gradient of z = dot(x.T, y.T)
// dy = dot(x, dz).T = dot(dz.T, x.T)
// dx = dot(dz, y).T = dot(y.T, dz.T)
ASSIGN_DISPATCH(mrhs_grad, req[1], dot(mout_grad.T(), mlhs_data.T()));
ASSIGN_DISPATCH(mlhs_grad, req[0], dot(mrhs_data.T(), mout_grad.T()));
} else if (!param.transpose_a && param.transpose_b) {
// Gradient of z = dot(x, y.T)
// dy = dot(x.T, dz).T = dot(dz.T, x)
// dx = dot(dz, y)
ASSIGN_DISPATCH(mrhs_grad, req[1], dot(mout_grad.T(), mlhs_data));
ASSIGN_DISPATCH(mlhs_grad, req[0], dot(mout_grad, mrhs_data));
} else if (param.transpose_a && !param.transpose_b) {
// Gradient of z = dot(x.T, y)
// dy = dot(x, dz)
// dx = dot(dz, y.T).T = dot(y, dz.T)
ASSIGN_DISPATCH(mrhs_grad, req[1], dot(mlhs_data, mout_grad));
ASSIGN_DISPATCH(mlhs_grad, req[0], dot(mrhs_data, mout_grad.T()));
} else {
// Gradient of z = dot(x, y)
// dy = dot(x.T, dz)
// dx = dot(dz, y.T)
ASSIGN_DISPATCH(mrhs_grad, req[1], dot(mlhs_data.T(), mout_grad));
ASSIGN_DISPATCH(mlhs_grad, req[0], dot(mout_grad, mrhs_data.T()));
}
}
}
inline bool DotShape(const nnvm::NodeAttrs& attrs,
std::vector<TShape> *in_attrs,
std::vector<TShape> *out_attrs) {
const DotParam& param = nnvm::get<DotParam>(attrs.parsed);
CHECK_EQ(in_attrs->size(), 2U);
CHECK_EQ(out_attrs->size(), 1U);
TShape& lshape = (*in_attrs)[0];
TShape& rshape = (*in_attrs)[1];
if (lshape.ndim() == 1 && rshape.ndim() == 1) {
CHECK(!param.transpose_a && !param.transpose_b) << "Cannot transpose vectors";
CHECK_EQ(lshape[0], rshape[0]) << "dot shape error: " << lshape << " X " << rshape;
SHAPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::Shape1(1));
} else {
bool Ta = param.transpose_a, Tb = param.transpose_b;
TShape L[2], R[2];
if (Ta) {
L[0] = mshadow::Shape1(lshape[0]);
L[1] = lshape.ndim() > 1 ? TShape(&lshape[1], &lshape[lshape.ndim()]) : TShape(1);
} else {
L[0] = lshape.ndim() > 1 ? TShape(&lshape[0], &lshape[lshape.ndim()-1]) : TShape(1);
L[1] = mshadow::Shape1(lshape[lshape.ndim()-1]);
}
if (Tb) {
R[0] = rshape.ndim() > 1 ? TShape(&rshape[0], &rshape[rshape.ndim()-1]) : TShape(1);
R[1] = mshadow::Shape1(rshape[rshape.ndim()-1]);
} else {
R[0] = mshadow::Shape1(rshape[0]);
R[1] = rshape.ndim() > 1 ? TShape(&rshape[1], &rshape[rshape.ndim()]) : TShape(1);
}
if (L[!Ta].Size() != 0 && R[Tb].Size() != 0) {
CHECK_EQ(L[!Ta].Size(), R[Tb].Size())
<< "dot shape error: " << lshape << " X " << rshape;
}
std::vector<index_t> buf;
if (lshape.ndim() > 1) buf.insert(buf.end(), &L[Ta][0], &L[Ta][L[Ta].ndim()]);
if (rshape.ndim() > 1) buf.insert(buf.end(), &R[!Tb][0], &R[!Tb][R[!Tb].ndim()]);
TShape oshape(buf.begin(), buf.end());
SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape);
}
return true;
}
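// Shape examples for the rule above (hand-worked, not exhaustive):
//   lhs (2, 3), rhs (3, 4), no transpose     -> (2, 4)
//   lhs (3, 2), rhs (3, 4), transpose_a=true -> (2, 4)   lhs is read as (2, 3)
//   lhs (3,),   rhs (3,)   (1-D x 1-D)       -> (1,)     scalar result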
template<typename xpu>
void BatchDotForward_(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow::expr;
mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
const DotParam& param = nnvm::get<DotParam>(attrs.parsed);
CHECK_EQ(outputs[0].type_flag_, inputs[0].type_flag_)
<< "Binary function only supports input/output with the same type";
CHECK_EQ(outputs[0].type_flag_, inputs[1].type_flag_)
<< "Binary function only supports input/output with the same type";
CHECK_EQ(outputs[0].type_flag_, mshadow::kFloat32)
<< "dot only supports 32-bit float so far";
mshadow::Tensor<xpu, 3, real_t> out = outputs[0].get<xpu, 3, real_t>(s);
mshadow::Tensor<xpu, 3, real_t> mlhs = inputs[0].get<xpu, 3, real_t>(s);
mshadow::Tensor<xpu, 3, real_t> mrhs = inputs[1].get<xpu, 3, real_t>(s);
mshadow::Tensor<xpu, 1, real_t*> workspace =
ctx.requested[0].get_space_typed<xpu, 1, real_t*>(mshadow::Shape1(3 * out.size(0)), s);
if (kNullOp != req[0]) {
if (param.transpose_a && param.transpose_b) {
mshadow::BatchGEMM<true, true>(out, mlhs, mrhs, 1.0f,
(kAddTo == req[0]) ? 1.0f : 0.0f,
workspace);
} else if (!param.transpose_a && param.transpose_b) {
mshadow::BatchGEMM<false, true>(out, mlhs, mrhs, 1.0f,
(kAddTo == req[0]) ? 1.0f : 0.0f,
workspace);
} else if (param.transpose_a && !param.transpose_b) {
mshadow::BatchGEMM<true, false>(out, mlhs, mrhs, 1.0f,
(kAddTo == req[0]) ? 1.0f : 0.0f,
workspace);
} else {
mshadow::BatchGEMM<false, false>(out, mlhs, mrhs, 1.0f,
(kAddTo == req[0]) ? 1.0f : 0.0f,
workspace);
}
}
}
template<typename xpu>
void BatchDotBackward_(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow::expr;
mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
const DotParam& param = nnvm::get<DotParam>(attrs.parsed);
CHECK_NE(req[1], kWriteInplace);
CHECK_NE(req[0], kWriteInplace);
mshadow::Tensor<xpu, 3, real_t> mout_grad = inputs[0].get<xpu, 3, real_t>(s);
mshadow::Tensor<xpu, 3, real_t> mlhs_data = inputs[1].get<xpu, 3, real_t>(s);
mshadow::Tensor<xpu, 3, real_t> mrhs_data = inputs[2].get<xpu, 3, real_t>(s);
mshadow::Tensor<xpu, 3, real_t> mlhs_grad = outputs[0].get<xpu, 3, real_t>(s);
mshadow::Tensor<xpu, 3, real_t> mrhs_grad = outputs[1].get<xpu, 3, real_t>(s);
mshadow::Tensor<xpu, 2, real_t*> workspace =
ctx.requested[0].get_space_typed<xpu, 2, real_t*>(
mshadow::Shape2(2, 3 * mout_grad.size(0)), s);
mshadow::Tensor<xpu, 1, real_t*> rhs_workspace = workspace[0];
mshadow::Tensor<xpu, 1, real_t*> lhs_workspace = workspace[1];
if (param.transpose_a && param.transpose_b) {
// Gradient of z = dot(x.T, y.T)
// dy = dot(x, dz).T = dot(dz.T, x.T)
// dx = dot(dz, y).T = dot(y.T, dz.T)
if (kNullOp != req[1]) {
mshadow::BatchGEMM<true, true>(mrhs_grad, mout_grad, mlhs_data, 1.0f,
(kAddTo == req[1]) ? 1.0f : 0.0f,
rhs_workspace);
}
if (kNullOp != req[0]) {
mshadow::BatchGEMM<true, true>(mlhs_grad, mrhs_data, mout_grad, 1.0f,
(kAddTo == req[0]) ? 1.0f : 0.0f,
lhs_workspace);
}
} else if (!param.transpose_a && param.transpose_b) {
// Gradient of z = dot(x, y.T)
// dy = dot(x.T, dz).T = dot(dz.T, x)
// dx = dot(dz, y)
if (kNullOp != req[1]) {
mshadow::BatchGEMM<true, false>(mrhs_grad, mout_grad, mlhs_data, 1.0f,
(kAddTo == req[1]) ? 1.0f : 0.0f,
rhs_workspace);
}
if (kNullOp != req[0]) {
mshadow::BatchGEMM<false, false>(mlhs_grad, mout_grad, mrhs_data, 1.0f,
(kAddTo == req[0]) ? 1.0f : 0.0f,
lhs_workspace);
}
} else if (param.transpose_a && !param.transpose_b) {
// Gradient of z = dot(x.T, y)
// dy = dot(x, dz)
// dx = dot(dz, y.T).T = dot(y, dz.T)
if (kNullOp != req[1]) {
mshadow::BatchGEMM<false, false>(mrhs_grad, mlhs_data, mout_grad, 1.0f,
(kAddTo == req[1]) ? 1.0f : 0.0f,
rhs_workspace);
}
if (kNullOp != req[0]) {
mshadow::BatchGEMM<false, true>(mlhs_grad, mrhs_data, mout_grad, 1.0f,
(kAddTo == req[0]) ? 1.0f : 0.0f,
lhs_workspace);
}
} else {
// Gradient of z = dot(x, y)
// dy = dot(x.T, dz)
// dx = dot(dz, y.T)
if (kNullOp != req[1]) {
mshadow::BatchGEMM<true, false>(mrhs_grad, mlhs_data, mout_grad, 1.0f,
(kAddTo == req[1]) ? 1.0f : 0.0f,
rhs_workspace);
}
if (kNullOp != req[0]) {
mshadow::BatchGEMM<false, true>(mlhs_grad, mout_grad, mrhs_data, 1.0f,
(kAddTo == req[0]) ? 1.0f : 0.0f,
lhs_workspace);
}
}
}
inline bool BatchDotShape(const nnvm::NodeAttrs& attrs,
std::vector<TShape> *in_attrs,
std::vector<TShape> *out_attrs) {
CHECK_EQ(in_attrs->size(), 2U);
CHECK_EQ(out_attrs->size(), 1U);
const DotParam& param = nnvm::get<DotParam>(attrs.parsed);
TShape& lshape = (*in_attrs)[0];
TShape& rshape = (*in_attrs)[1];
if (lshape.ndim() == 3 && rshape.ndim() == 3) {
CHECK(lshape[0] == rshape[0])
<< "batch_dot shape error(batch_size must be equal): " << lshape << " X " << rshape
<< " trans_a=" << param.transpose_a << " trans_b=" << param.transpose_b;
index_t out_m = param.transpose_a ? lshape[2] : lshape[1];
index_t lshape_k = param.transpose_a ? lshape[1] : lshape[2];
index_t out_n = param.transpose_b ? rshape[1] : rshape[2];
index_t rshape_k = param.transpose_b ? rshape[2] : rshape[1];
CHECK(lshape_k == rshape_k)
<< "batch_dot shape error(shape mismatch): " << lshape << " X " << rshape
<< " trans_a=" << param.transpose_a << " trans_b=" << param.transpose_b;
SHAPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::Shape3(lshape[0], out_m, out_n));
} else {
LOG(FATAL) << "batch_dot currently only support 3D*3D array"
<< lshape << " v.s. " << rshape;
}
return true;
}
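// Example (illustrative): lhs (B, m, k) batch_dot rhs (B, k, n) -> (B, m, n);
// with transpose_a the lhs is read as (B, k, m), with transpose_b the rhs as (B, n, k).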
struct SliceParam : public dmlc::Parameter<SliceParam> {
nnvm::Tuple<dmlc::optional<int> > begin, end;
DMLC_DECLARE_PARAMETER(SliceParam) {
DMLC_DECLARE_FIELD(begin)
.describe("starting coordinates");
DMLC_DECLARE_FIELD(end)
.describe("ending coordinates");
}
};
inline TShape GetSliceShape(const SliceParam& param, const TShape& dshape) {
CHECK_LE(param.begin.ndim(), dshape.ndim())
<< "Slicing axis exceeds data dimensions";
CHECK_LE(param.end.ndim(), dshape.ndim())
<< "Slicing axis exceeds data dimensions";
TShape oshape(dshape.ndim());
for (index_t i = 0; i < dshape.ndim(); ++i) {
int s = 0, e = dshape[i];
if (e != 0) {
if (param.begin[i]) {
CHECK_LE(*param.begin[i], e)
<< "Slicing begin exceeds data dimensions "
<< param.begin << " vs " << dshape;
s = *param.begin[i];
if (s < 0) s += dshape[i];
}
if (param.end[i]) {
CHECK_LE(*param.end[i], e)
<< "Slicing end exceeds data dimensions "
<< param.end << " vs " << dshape;
e = *param.end[i];
if (e < 0) e += dshape[i];
}
CHECK(s >= 0 && s < e && e <= static_cast<int>(dshape[i]))
<< "Invalid slicing begin " << param.begin << " and end "
<< param.end << " for data of shape " << dshape;
}
oshape[i] = e - s;
}
return oshape;
}
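// Examples (hand-checked against the logic above):
//   data (4, 5), begin=(1, 0),  end=(3, 4) -> output (2, 4)
//   data (4, 5), begin=(-3, 0), end=(4, 5) -> output (3, 5)   negative begin counts from the end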
inline bool SliceShape(const nnvm::NodeAttrs& attrs,
std::vector<TShape> *in_attrs,
std::vector<TShape> *out_attrs) {
const TShape& dshape = (*in_attrs)[0];
if (dshape.ndim() == 0) return false;
const SliceParam& param = nnvm::get<SliceParam>(attrs.parsed);
SHAPE_ASSIGN_CHECK(*out_attrs, 0, GetSliceShape(param, dshape));
return true;
}
// matrix crop for multi-dimensional cropping; see also slice
template<typename xpu>
void Slice(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
using namespace mshadow::expr;
const SliceParam& param = nnvm::get<SliceParam>(attrs.parsed);
index_t N = inputs[0].ndim();
TShape begin(N), end(N);
for (index_t i = 0; i < N; ++i) {
int s = 0;
if (param.begin[i]) {
s = *param.begin[i];
if (s < 0) s += inputs[0].size(i);
}
begin[i] = s;
end[i] = s + outputs[0].size(i);
}
Stream<xpu> *s = ctx.get_stream<xpu>();
MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
switch (inputs[0].ndim()) {
case 0:
break;
case 1: {
Tensor<xpu, 1, DType> in = inputs[0].get<xpu, 1, DType>(s);
Tensor<xpu, 1, DType> out = outputs[0].get<xpu, 1, DType>(s);
out = slice(in, begin.get<1>(), end.get<1>());
break;
}
case 2: {
Tensor<xpu, 2, DType> in = inputs[0].get<xpu, 2, DType>(s);
Tensor<xpu, 2, DType> out = outputs[0].get<xpu, 2, DType>(s);
out = slice(in, begin.get<2>(), end.get<2>());
break;
}
case 3: {
Tensor<xpu, 3, DType> in = inputs[0].get<xpu, 3, DType>(s);
Tensor<xpu, 3, DType> out = outputs[0].get<xpu, 3, DType>(s);
out = slice(in, begin.get<3>(), end.get<3>());
break;
}
case 4: {
Tensor<xpu, 4, DType> in = inputs[0].get<xpu, 4, DType>(s);
Tensor<xpu, 4, DType> out = outputs[0].get<xpu, 4, DType>(s);
out = slice(in, begin.get<4>(), end.get<4>());
break;
}
case 5: {
Tensor<xpu, 5, DType> in = inputs[0].get<xpu, 5, DType>(s);
Tensor<xpu, 5, DType> out = outputs[0].get<xpu, 5, DType>(s);
out = slice(in, begin.get<5>(), end.get<5>());
break;
}
default:
LOG(FATAL) << "crop supports at most 5 dimensions";
break;
}
});
}
inline bool SliceAssignShape(const nnvm::NodeAttrs& attrs,
std::vector<TShape> *in_attrs,
std::vector<TShape> *out_attrs) {
const TShape& lshape = (*in_attrs)[0];
if (lshape.ndim() == 0) return false;
const SliceParam& param = nnvm::get<SliceParam>(attrs.parsed);
SHAPE_ASSIGN_CHECK(*in_attrs, 1, GetSliceShape(param, lshape));
SHAPE_ASSIGN_CHECK(*out_attrs, 0, lshape);
return true;
}
template<typename xpu>
void SliceAssignImpl(mshadow::Stream<xpu> *s, const SliceParam& param,
const TBlob& dst, const TBlob& src) {
using namespace mshadow;
using namespace mshadow::expr;
index_t N = dst.ndim();
TShape begin(N), end(N);
for (index_t i = 0; i < N; ++i) {
int s = 0;
if (param.begin[i]) {
s = *param.begin[i];
if (s < 0) s += dst.size(i);
}
begin[i] = s;
end[i] = s + src.size(i);
}
MSHADOW_TYPE_SWITCH(dst.type_flag_, DType, {
switch (dst.ndim()) {
case 0:
break;
case 1: {
Tensor<xpu, 1, DType> out = dst.get<xpu, 1, DType>(s);
Tensor<xpu, 1, DType> in = src.get<xpu, 1, DType>(s);
slice(out, begin.get<1>(), end.get<1>()) = in;
break;
}
case 2: {
Tensor<xpu, 2, DType> out = dst.get<xpu, 2, DType>(s);
Tensor<xpu, 2, DType> in = src.get<xpu, 2, DType>(s);
slice(out, begin.get<2>(), end.get<2>()) = in;
break;
}
case 3: {
Tensor<xpu, 3, DType> out = dst.get<xpu, 3, DType>(s);
Tensor<xpu, 3, DType> in = src.get<xpu, 3, DType>(s);
slice(out, begin.get<3>(), end.get<3>()) = in;
break;
}
case 4: {
Tensor<xpu, 4, DType> out = dst.get<xpu, 4, DType>(s);
Tensor<xpu, 4, DType> in = src.get<xpu, 4, DType>(s);
slice(out, begin.get<4>(), end.get<4>()) = in;
break;
}
case 5: {
Tensor<xpu, 5, DType> out = dst.get<xpu, 5, DType>(s);
Tensor<xpu, 5, DType> in = src.get<xpu, 5, DType>(s);
slice(out, begin.get<5>(), end.get<5>()) = in;
break;
}
default:
LOG(FATAL) << "CropAssign supports at most 5 dimensions";
break;
}
});
}
template<typename xpu>
void SliceAssign(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
using namespace mshadow::expr;
const SliceParam& param = nnvm::get<SliceParam>(attrs.parsed);
Stream<xpu> *s = ctx.get_stream<xpu>();
if (req[0] == kNullOp) {
return;
} else if (req[0] == kWriteTo) {
MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
Tensor<xpu, 1, DType> in = inputs[0].FlatTo1D<xpu, DType>(s);
Tensor<xpu, 1, DType> out = outputs[0].FlatTo1D<xpu, DType>(s);
Copy(out, in, s);
});
} else if (req[0] != kWriteInplace) {
LOG(FATAL) << "CropAssign only supports kWriteTo and kWriteInplace";
}
SliceAssignImpl<xpu>(s, param, outputs[0], inputs[1]);
}
template<typename xpu>
void SliceBackward(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
using namespace mshadow::expr;
const SliceParam& param = nnvm::get<SliceParam>(attrs.parsed);
Stream<xpu> *s = ctx.get_stream<xpu>();
if (req[0] == kNullOp) {
return;
} else if (req[0] == kWriteTo) {
MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
Tensor<xpu, 1, DType> out = outputs[0].FlatTo1D<xpu, DType>(s);
out = DType(0);
});
} else {
LOG(FATAL) << "CropAssign only supports kWriteTo";
}
SliceAssignImpl<xpu>(s, param, outputs[0], inputs[0]);
}
struct SimpleCropAssignScalarParam : public dmlc::Parameter<SimpleCropAssignScalarParam> {
real_t scalar;
TShape begin, end;
DMLC_DECLARE_PARAMETER(SimpleCropAssignScalarParam) {
DMLC_DECLARE_FIELD(scalar)
.set_default(0)
.describe("The scalar value for assignment.");
DMLC_DECLARE_FIELD(begin)
.describe("starting coordinates");
DMLC_DECLARE_FIELD(end)
.describe("ending coordinates");
}
};
template<typename xpu>
void CropAssignScalar(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
using namespace mshadow::expr;
const SimpleCropAssignScalarParam& param = nnvm::get<SimpleCropAssignScalarParam>(attrs.parsed);
Stream<xpu> *s = ctx.get_stream<xpu>();
if (req[0] == kWriteTo) {
MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
Tensor<xpu, 1, DType> in = inputs[0].FlatTo1D<xpu, DType>(s);
Tensor<xpu, 1, DType> out = outputs[0].FlatTo1D<xpu, DType>(s);
Copy(out, in, s);
});
} else if (req[0] != kWriteInplace) {
LOG(FATAL) << "CropAssignScalar only supports kWriteTo and kWriteInplace";
}
MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
switch (outputs[0].shape_.ndim()) {
case 0:
break;
case 1: {
Tensor<xpu, 1, DType> out = outputs[0].get<xpu, 1, DType>(s);
slice(out, param.begin.get<1>(), param.end.get<1>()) = \
static_cast<DType>(param.scalar);
break;
}
case 2: {
Tensor<xpu, 2, DType> out = outputs[0].get<xpu, 2, DType>(s);
slice(out, param.begin.get<2>(), param.end.get<2>()) = \
static_cast<DType>(param.scalar);
break;
}
case 3: {
Tensor<xpu, 3, DType> out = outputs[0].get<xpu, 3, DType>(s);
slice(out, param.begin.get<3>(), param.end.get<3>()) = \
static_cast<DType>(param.scalar);
break;
}
case 4: {
Tensor<xpu, 4, DType> out = outputs[0].get<xpu, 4, DType>(s);
slice(out, param.begin.get<4>(), param.end.get<4>()) = \
static_cast<DType>(param.scalar);
break;
}
case 5: {
Tensor<xpu, 5, DType> out = outputs[0].get<xpu, 5, DType>(s);
slice(out, param.begin.get<5>(), param.end.get<5>()) = \
static_cast<DType>(param.scalar);
break;
}
default:
LOG(FATAL) << "CropAssign supports at most 5 dimensions";
break;
}
});
}
inline bool CropAssignScalarShape(const nnvm::NodeAttrs& attrs,
std::vector<TShape> *in_attrs,
std::vector<TShape> *out_attrs) {
const SimpleCropAssignScalarParam& param = nnvm::get<SimpleCropAssignScalarParam>(attrs.parsed);
TShape& lshape = (*in_attrs)[0];
CHECK_EQ(lshape.ndim(), param.begin.ndim());
CHECK_EQ(lshape.ndim(), param.end.ndim());
for (index_t i = 0; i < lshape.ndim(); ++i) {
CHECK_LT(param.begin[i], param.end[i]);
CHECK_LE(param.end[i], lshape[i]);
}
SHAPE_ASSIGN_CHECK(*out_attrs, 0, lshape);
return true;
}
struct SliceAxisParam : public dmlc::Parameter<SliceAxisParam> {
int axis;
int begin;
dmlc::optional<int> end;
DMLC_DECLARE_PARAMETER(SliceAxisParam) {
DMLC_DECLARE_FIELD(axis)
.describe("The axis to be sliced."
" Negative axis means to count from the last to the first axis.");
DMLC_DECLARE_FIELD(begin)
.describe("The beginning index to be sliced."
" Negative values are interpreted as counting from the backward.");
DMLC_DECLARE_FIELD(end)
.describe("The end index to be sliced."
" The end can be None, in which case all the rest elements are used."
" Also, negative values are interpreted as counting from the backward.");
}
};
inline void GetSliceAxisParams(const SliceAxisParam& param, const TShape& ishape,
int* axis, int* begin, int* end) {
*axis = param.axis;
if (*axis < 0) {
*axis += static_cast<int>(ishape.ndim());
}
CHECK(*axis < static_cast<int>(ishape.ndim()) && *axis >= 0) <<
"Transformed axis must be smaller than the source ndim and larger than zero! Recieved axis=" <<
param.axis << ", src_ndim=" << ishape.ndim() << ", transformed axis=" << *axis;
int axis_size = static_cast<int>(ishape[*axis]);
*begin = param.begin;
*end = -1;
if (*begin < 0) {
*begin += axis_size;
}
if (!static_cast<bool>(param.end)) {
*end = axis_size;
} else {
*end = param.end.value();
if (*end < 0) {
*end += axis_size;
}
}
CHECK((*end <= axis_size) && (*end >= 0))
<< "Invalid begin, end, get begin=" << param.begin << ", end=" << param.end;
CHECK((*begin < *end) && (*begin >= 0))
<< "Invalid begin, end, get begin=" << param.begin << ", end=" << param.end;
}
inline bool SliceAxisShape(const nnvm::NodeAttrs& attrs,
std::vector<TShape> *in_attrs,
std::vector<TShape> *out_attrs) {
const SliceAxisParam& param = nnvm::get<SliceAxisParam>(attrs.parsed);
CHECK_EQ(in_attrs->size(), 1U);
CHECK_EQ(out_attrs->size(), 1U);
TShape& ishape = (*in_attrs)[0];
int axis, begin, end;
GetSliceAxisParams(param, ishape, &axis, &begin, &end);
TShape shape(ishape.ndim());
for (index_t i = 0; i < ishape.ndim(); ++i) {
if (static_cast<int>(i) == axis) {
shape[i] = static_cast<index_t>(end - begin);
} else {
shape[i] = ishape[i];
}
}
SHAPE_ASSIGN_CHECK(*out_attrs, 0, shape);
return true;
}
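// Example (illustrative): input (3, 4, 5), axis=-1, begin=1, end=None
// -> axis is normalized to 2, end defaults to 5, output shape (3, 4, 4).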
template<typename xpu>
void SliceAxis(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow::expr;
const SliceAxisParam& param = nnvm::get<SliceAxisParam>(attrs.parsed);
mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
int axis, begin, end;
GetSliceAxisParams(param, inputs[0].shape_, &axis, &begin, &end);
int ndim = static_cast<int>(outputs[0].ndim());
if (axis + 1 == ndim) {
MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
mshadow::Tensor<xpu, 2, DType> in =
inputs[0].FlatTo2D<xpu, DType>(s);
mshadow::Tensor<xpu, 2, DType> out =
outputs[0].FlatTo2D<xpu, DType>(s);
ASSIGN_DISPATCH(out, req[0], slice<1>(in, begin, end));
});
} else {
MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
mshadow::Tensor<xpu, 3, DType> in =
inputs[0].FlatTo3D<xpu, DType>(axis, s);
mshadow::Tensor<xpu, 3, DType> out =
outputs[0].FlatTo3D<xpu, DType>(axis, s);
ASSIGN_DISPATCH(out, req[0], slice<1>(in, begin, end));
});
}
}
// Backward pass of slice_axis: write the output gradient into the sliced range of the input gradient
template<typename xpu>
void SliceAxisGrad_(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
const SliceAxisParam& param = nnvm::get<SliceAxisParam>(attrs.parsed);
using namespace mshadow::op;
using namespace mshadow::expr;
mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
int axis, begin, end;
GetSliceAxisParams(param, outputs[0].shape_, &axis, &begin, &end);
int ndim = static_cast<int>(outputs[0].shape_.ndim());
if (axis + 1 == ndim) {
MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
mshadow::Tensor<xpu, 2, DType> ograd =
inputs[0].FlatTo2D<xpu, DType>(s);
mshadow::Tensor<xpu, 2, DType> igrad =
outputs[0].FlatTo2D<xpu, DType>(s);
if (req[0] == kAddTo) {
slice<1>(igrad, begin, end) += F<identity>(ograd);
} else if (req[0] == kWriteTo) {
igrad = 0.0f;
slice<1>(igrad, begin, end) = F<identity>(ograd);
} else {
CHECK_EQ(req[0], kNullOp);
}
});
} else {
MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
mshadow::Tensor<xpu, 3, DType> ograd =
inputs[0].FlatTo3D<xpu, DType>(axis, s);
mshadow::Tensor<xpu, 3, DType> igrad =
outputs[0].FlatTo3D<xpu, DType>(axis, s);
if (req[0] == kAddTo) {
slice<1>(igrad, begin, end) += F<identity>(ograd);
} else if (req[0] == kWriteTo) {
igrad = 0.0f;
slice<1>(igrad, begin, end) = F<identity>(ograd);
} else {
CHECK_EQ(req[0], kNullOp);
}
});
}
}
struct ClipParam : public dmlc::Parameter<ClipParam> {
real_t a_min, a_max;
DMLC_DECLARE_PARAMETER(ClipParam) {
DMLC_DECLARE_FIELD(a_min)
.describe("Minimum value");
DMLC_DECLARE_FIELD(a_max)
.describe("Maximum value");
}
};
template<typename xpu>
void Clip(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
using namespace mxnet_op;
const ClipParam& param = nnvm::get<ClipParam>(attrs.parsed);
CHECK_EQ(inputs[0].type_flag_, outputs[0].type_flag_);
Stream<xpu> *s = ctx.get_stream<xpu>();
MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
Kernel<clip, xpu>::Launch(s, outputs[0].Size(), outputs[0].dptr<DType>(),
inputs[0].dptr<DType>(), DType(param.a_min), DType(param.a_max));
});
}
template<typename xpu>
void ClipGrad_(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
using namespace mxnet_op;
const ClipParam& param = nnvm::get<ClipParam>(attrs.parsed);
CHECK_EQ(inputs[0].type_flag_, outputs[0].type_flag_);
Stream<xpu> *s = ctx.get_stream<xpu>();
MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
Kernel<clip_grad, xpu>::Launch(s, outputs[0].Size(), outputs[0].dptr<DType>(),
inputs[0].dptr<DType>(), inputs[1].dptr<DType>(), DType(param.a_min), DType(param.a_max));
});
}
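// Gradient note (informal): the clip_grad kernel used above passes each incoming
// gradient element through unchanged when the corresponding input lies inside
// [a_min, a_max] and yields zero otherwise, so clipped entries receive no gradient.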
/*!
* \brief The parameters of the repeat operator include
* the number of repetitions and an optional axis.
* The parameters are later used to deduce the
* output ndarray shape in the RepeatOpShape() function.
*/
struct RepeatParam : public dmlc::Parameter<RepeatParam> {
int repeats = 1;
dmlc::optional<int> axis;
DMLC_DECLARE_PARAMETER(RepeatParam) {
DMLC_DECLARE_FIELD(repeats)
.describe("The number of repetitions for each element.");
DMLC_DECLARE_FIELD(axis)
.set_default(dmlc::optional<int>())
.describe("The axis along which to repeat values."
" The negative numbers are interpreted counting from the backward."
" By default, use the flattened input array,"
" and return a flat output array.");
}
};
/*!
* \brief Helper function for getting user input params for the operator repeat.
* Sanity check the user input values.
*/
inline void GetRepeatParams(const RepeatParam& param, const TShape& ishape,
int* repeats, dmlc::optional<int>* axisOpt) {
*repeats = param.repeats;
CHECK_GE(*repeats, 0) << "repeats cannot be a negative number";
*axisOpt = param.axis;
if (static_cast<bool>(*axisOpt)) {
int ndims = static_cast<int>(ishape.ndim());
int axis = axisOpt->value();
if (axis < 0) {
axis += ndims;
}
CHECK(axis >= 0 && axis < ndims) << "axis = " << axisOpt->value() << " out of bounds";
}
}
inline bool RepeatOpShape(const nnvm::NodeAttrs& attrs,
std::vector<TShape> *in_attrs,
std::vector<TShape> *out_attrs) {
const RepeatParam& param = nnvm::get<RepeatParam>(attrs.parsed);
CHECK_EQ(in_attrs->size(), 1U);
CHECK_EQ(out_attrs->size(), 1U);
const TShape& ishape = (*in_attrs)[0];
int repeats = 0;
dmlc::optional<int> axisOpt;
GetRepeatParams(param, ishape, &repeats, &axisOpt);
// If 0 repeats, return an empty 0 dim array
if (0 == repeats) {
SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape());
return true;
}
// If repeats > 0, multiply the size of the corresponding axis by repeats
if (static_cast<bool>(axisOpt)) {
int ndims = static_cast<int>(ishape.ndim());
int axis = axisOpt.value();
if (axis < 0) {
axis += ndims;
}
TShape shape(ishape.ndim());
for (index_t i = 0; i < ishape.ndim(); ++i) {
if (static_cast<int>(i) == axis) {
shape[i] = static_cast<index_t>(repeats) * ishape[i];
} else {
shape[i] = ishape[i];
}
}
SHAPE_ASSIGN_CHECK(*out_attrs, 0, shape);
} else { // If axis is not input by user, return a flat 1D array of size = in.size*repeats
TShape shape(1);
shape[0] = ishape.Size() * static_cast<index_t>(repeats);
SHAPE_ASSIGN_CHECK(*out_attrs, 0, shape);
}
return true;
}
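// Examples of the repeat shape rule above (for illustration):
//   input (2, 3), repeats=2, axis unset -> (12,)   flattened, every element repeated
//   input (2, 3), repeats=2, axis=1     -> (2, 6)
//   repeats=0                           -> empty 0-dim shape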
inline bool RepeatOpType(const nnvm::NodeAttrs& attrs,
std::vector<int>* in_attrs,
std::vector<int>* out_attrs) {
CHECK_EQ(in_attrs->size(), 1U);
if ((*in_attrs)[0] != -1) {
TYPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[0]);
} else if ((*out_attrs)[0] != -1) {
TYPE_ASSIGN_CHECK(*in_attrs, 0, (*out_attrs)[0]);
}
return true;
}
/*!
* \brief Reshape the input and output tensors for
* using broadcast_to to achieve the functionality
* of operator repeat.
* \return a pair of TShape's, first is the reshaped
* input shape, second is the reshaped output shape.
*/
inline std::pair<TShape, TShape> ReshapeInputOutputForRepeatOp(const TShape& ishape,
const dmlc::optional<int>& axisOpt,
const int repeats) {
if (static_cast<bool>(axisOpt)) {
int axis = axisOpt.value();
int ndim = static_cast<int>(ishape.ndim());
if (axis < 0) {
axis += ndim;
}
CHECK(axis >= 0 && axis < static_cast<int>(ishape.ndim())) << "Invalid input of axis";
// reshape the input tensor by adding a dim at the (axis+1)-th dim
TShape rshape(ishape.ndim()+1);
// the shape we want to broadcast to
TShape bshape(rshape.ndim());
int i = 0;
while (i <= axis) {
rshape[i] = bshape[i] = ishape[i];
++i;
}
rshape[i] = 1;
bshape[i] = repeats;
while (i < static_cast<int>(ishape.ndim())) {
rshape[i+1] = ishape[i];
bshape[i+1] = ishape[i];
++i;
}
return std::make_pair(rshape, bshape);
} else {
// axis is not input by user
// reshape the tensor into shape (ishape.Size(), 1)
// then add one dim at axis = 1 and broadcast to
// shape (ishape.Size(), repeats)
TShape rshape(2);
rshape[0] = ishape.Size();
rshape[1] = 1;
TShape bshape(2);
bshape[0] = rshape[0];
bshape[1] = repeats;
return std::make_pair(rshape, bshape);
}
}
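// Worked example of the reshape/broadcast trick above (hand-derived):
// input (2, 3), axis=1, repeats=2:
//   rshape = (2, 3, 1)   a length-1 axis is inserted after the repeat axis
//   bshape = (2, 3, 2)   broadcast along the inserted axis
// Broadcasting (2, 3, 1) to (2, 3, 2) and viewing the result as (2, 6)
// repeats each element twice along axis 1, which is the expected repeat output.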
template<typename xpu>
void RepeatOpForward(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
const TBlob& iTBlob = inputs[0];
const TShape& ishape = iTBlob.shape_;
if (ishape.ndim() == 0) return;
int repeats = 0;
dmlc::optional<int> axisOpt;
const RepeatParam& param = nnvm::get<RepeatParam>(attrs.parsed);
GetRepeatParams(param, ishape, &repeats, &axisOpt);
if (0 == repeats) return;
mshadow::Stream<xpu>* s = ctx.get_stream<xpu>();
std::pair<TShape, TShape> rshapes = ReshapeInputOutputForRepeatOp(ishape, axisOpt, repeats);
// reshaped input tblob
TBlob iblob(inputs[0].dptr_, rshapes.first, inputs[0].dev_mask_, inputs[0].type_flag_);
std::vector<TBlob> newInputs = {iblob};
// reshaped output tblob
TBlob oblob(outputs[0].dptr_, rshapes.second, outputs[0].dev_mask_, outputs[0].type_flag_);
std::vector<TBlob> newOutputs = {oblob};
BroadcastCompute<xpu>(attrs, ctx, newInputs, req, newOutputs);
}
/*!
* \brief Compute the gradient of the loss function
* with respect to the input of the operator.
* Backpropagation is employed to implement the
* chain rule.
* \param inputs the gradient of the loss function
* with respect to the outputs of the operator
* \param outputs the gradient of the loss function
* with respect to the inputs of the operator
*/
template<typename xpu>
void RepeatOpBackward(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
CHECK_EQ(inputs.size(), 1U);
CHECK_EQ(outputs.size(), 1U);
const TShape& oshape = outputs[0].shape_;
if (oshape.ndim() == 0) return;
int repeats = 0;
dmlc::optional<int> axisOpt;
const RepeatParam& param = nnvm::get<RepeatParam>(attrs.parsed);
GetRepeatParams(param, oshape, &repeats, &axisOpt);
if (0 == repeats) return;
std::pair<TShape, TShape> rshapes =
ReshapeInputOutputForRepeatOp(oshape, axisOpt, repeats);
// reshaped output grad tblob
TBlob oblob(outputs[0].dptr_, rshapes.first, outputs[0].dev_mask_, outputs[0].type_flag_);
std::vector<TBlob> newOutputs = {oblob};
// reshaped input grad tblob
TBlob iblob(inputs[0].dptr_, rshapes.second, inputs[0].dev_mask_, inputs[0].type_flag_);
std::vector<TBlob> newInputs = {iblob};
ReduceAxesComputeImpl<xpu, mshadow::red::sum, false>(
attrs, ctx, newInputs, req, newOutputs, rshapes.first);
}
struct TileParam : public dmlc::Parameter<TileParam> {
TShape reps;
DMLC_DECLARE_PARAMETER(TileParam) {
DMLC_DECLARE_FIELD(reps)
.describe("The number of times for repeating the tensor a."
" If reps has length d, the result will have dimension of max(d, a.ndim);"
" If a.ndim < d, a is promoted to be d-dimensional by prepending new axes."
" If a.ndim > d, reps is promoted to a.ndim by pre-pending 1's to it.");
}
};
inline bool TileOpShape(const nnvm::NodeAttrs& attrs,
std::vector<TShape> *in_attrs,
std::vector<TShape> *out_attrs) {
CHECK_EQ(in_attrs->size(), 1U);
CHECK_EQ(out_attrs->size(), 1U);
const TileParam& param = nnvm::get<TileParam>(attrs.parsed);
const TShape& ishape = (*in_attrs)[0];
const TShape& reps = param.reps;
// If reps is empty, the output shape is identical to the input shape
if (reps.ndim() == 0 || ishape.ndim() == 0) {
SHAPE_ASSIGN_CHECK(*out_attrs, 0, ishape);
return true;
}
TShape oshape(std::max(ishape.ndim(), reps.ndim()));
int i1 = static_cast<int>(ishape.ndim()) - 1;
int i2 = static_cast<int>(reps.ndim()) - 1;
for (int i = static_cast<int>(oshape.ndim()) - 1; i >= 0; --i) {
if (i1 >= 0 && i2 >= 0) {
oshape[i] = ishape[i1--] * reps[i2--];
} else if (i1 >= 0) {
oshape[i] = ishape[i1--];
} else if (i2 >= 0) {
oshape[i] = reps[i2--];
}
}
SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape);
return true;
}
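// Examples of the tiling shape rule above (hand-checked):
//   input (2, 3),    reps=(2, 2) -> (4, 6)
//   input (3,),      reps=(2, 4) -> (2, 12)    input is promoted to (1, 3)
//   input (2, 3, 4), reps=(2,)   -> (2, 3, 8)  reps is promoted to (1, 1, 2)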
inline bool TileOpType(const nnvm::NodeAttrs& attrs,
std::vector<int>* in_attrs,
std::vector<int>* out_attrs) {
CHECK_EQ(in_attrs->size(), 1U);
if ((*in_attrs)[0] != -1) {
TYPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[0]);
} else if ((*out_attrs)[0] != -1) {
TYPE_ASSIGN_CHECK(*in_attrs, 0, (*out_attrs)[0]);
}
return true;
}
/*!
* \brief Reshape the input and output tensors for
* using broadcast_to to achieve the functionality
* of operator tile.
* \return a pair of TShape's, first is the reshaped
* input shape, second is the reshaped output shape.
*/
inline std::pair<TShape, TShape> ReshapeInputOutputForTileOp(const TShape& ishape,
const TShape& reps) {
if (ishape.ndim() == 0 || reps.ndim() == 0) {
return std::make_pair(ishape, ishape);
}
// The shape we want to broadcast to
TShape bshape(std::max(ishape.ndim(), reps.ndim()) * 2);
// The shape of the input tensor after adding new axes before each dim
TShape rshape(bshape.ndim());
int i1 = static_cast<int>(ishape.ndim()) - 1;
int i2 = static_cast<int>(reps.ndim()) - 1;
for (int i = static_cast<int>(bshape.ndim()) - 1; i >= 0; --i) {
if (0 == (i & 1)) {
bshape[i] = (i2 >= 0? reps[i2--] : 1);
rshape[i] = 1;
} else {
rshape[i] = bshape[i] = (i1 >= 0? ishape[i1--] : 1);
}
}
return std::make_pair(rshape, bshape);
}
/*!
* \brief Implementation of tiling the input tensor a based
* on the user-input shape, reps.
* If a.ndim < reps.ndim, new axes are pre-pended to a. For example,
* the input tensor has shape (3,), and the reps is (2, 4); the input
* tensor would be reshaped to (1, 3).
* If a.ndim > reps.ndim, pre-pending 1's to reps. For example,
* the input tensor has shape (2, 3, 4, 5), and reps is (2, 2);
* the reps would be changed to (1, 1, 2, 2).
* Suppose we have a.ndim = reps.ndim now. To achieve tiling,
* we utilize the operator broadcast_to. For example, for a tensor
* of shape (2, 3, 4, 5) and reps (2, 8, 9, 3), we first reshape
* the tensor to the shape (1, 2, 1, 3, 1, 4, 1, 5) by adding
* one axis before each dimension. Then, we want to broadcast
* the new tensor to shape (2, 2, 8, 3, 9, 4, 3, 5). The final
* output tensor would have shape (2*2, 8*3, 9*4, 3*5).
*/
template<typename xpu>
void TileOpForward(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
CHECK_EQ(inputs.size(), 1U);
CHECK_EQ(outputs.size(), 1U);
if (inputs[0].Size() == 0) return;
const TShape& ishape = inputs[0].shape_;
const TShape& reps = nnvm::get<TileParam>(attrs.parsed).reps;
// If any one of the number in reps is zero, return immediately
for (index_t i = 0; i < reps.ndim(); ++i) {
if (0 == reps[i]) return;
}
std::pair<TShape, TShape> rshapes = ReshapeInputOutputForTileOp(ishape, reps);
// reshaped input tblob
TBlob iblob(inputs[0].dptr_, rshapes.first, inputs[0].dev_mask_, inputs[0].type_flag_);
std::vector<TBlob> newInputs = {iblob};
// reshaped output tblob
TBlob oblob(outputs[0].dptr_, rshapes.second, outputs[0].dev_mask_, outputs[0].type_flag_);
std::vector<TBlob> newOutputs = {oblob};
BroadcastCompute<xpu>(attrs, ctx, newInputs, req, newOutputs);
}
/*!
* \brief Compute the gradient of the loss function
* with respect to the input of the operator.
* Backpropagation is employed to implement the
* chain rule.
* \param inputs the gradient of the loss function
* with respect to the outputs of the operator
* \param outputs the gradient of the loss function
* with respect to the inputs of the operator
*/
template<typename xpu>
void TileOpBackward(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
CHECK_EQ(inputs.size(), 1U);
CHECK_EQ(outputs.size(), 1U);
if (inputs[0].Size() == 0) return;
const TShape& oshape = outputs[0].shape_;
const TShape& reps = nnvm::get<TileParam>(attrs.parsed).reps;
// If any one of the number in reps is zero, return immediately
for (index_t i = 0; i < reps.ndim(); ++i) {
if (0 == reps[i]) return;
}
std::pair<TShape, TShape> rshapes = ReshapeInputOutputForTileOp(oshape, reps);
// reshaped output grad tblob
TBlob oblob(outputs[0].dptr_, rshapes.first, outputs[0].dev_mask_, outputs[0].type_flag_);
std::vector<TBlob> newOutputs = {oblob};
// reshaped input grad tblob
TBlob iblob(inputs[0].dptr_, rshapes.second, inputs[0].dev_mask_, inputs[0].type_flag_);
std::vector<TBlob> newInputs = {iblob};
ReduceAxesComputeImpl<xpu, mshadow::red::sum, false>(
attrs, ctx, newInputs, req, newOutputs, rshapes.first);
}
struct ReverseParam : public dmlc::Parameter<ReverseParam> {
nnvm::Tuple<int> axis;
DMLC_DECLARE_PARAMETER(ReverseParam) {
DMLC_DECLARE_FIELD(axis)
.describe("The axis which to reverse elements.");
}
};
template<typename xpu>
void ReverseOpForward(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
using namespace mxnet_op;
const ReverseParam& param = nnvm::get<ReverseParam>(attrs.parsed);
CHECK_EQ(inputs[0].type_flag_, outputs[0].type_flag_);
CHECK_LT(param.axis.ndim(), REVERSE_MAX_DIM);
Stream<xpu> *s = ctx.get_stream<xpu>();
const TShape& ishape = inputs[0].shape_;
std::vector<index_t> stride_(param.axis.ndim());
std::vector<index_t> trailing_(param.axis.ndim());
index_t reverse_index = 0;
for (auto axis_iter = param.axis.begin() ; axis_iter!= param.axis.end(); ++axis_iter) {
CHECK_LT(*axis_iter, static_cast<int>(ishape.ndim()));
stride_[reverse_index] = ishape[*axis_iter];
trailing_[reverse_index] = 1;
for (int i2 = *axis_iter + 1; i2 < ishape.ndim(); ++i2) {
trailing_[reverse_index] *= ishape[i2];
}
reverse_index++;
}
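// Illustrative values (hand-derived from the loop above): for ishape = (2, 3, 4)
// and axis = (1,), stride_ = {3} (length of the reversed axis) and
// trailing_ = {4} (number of elements after that axis for each index).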
#ifdef __CUDACC__
mshadow::Tensor<xpu, 1, uint8_t> workspace =
ctx.requested[0].get_space_typed<xpu, 1, uint8_t>(
mshadow::Shape1(reverse_index * sizeof(index_t) * 2), s);
auto stride_workspace = workspace.dptr_;
auto trailing_workspace = workspace.dptr_ + reverse_index * sizeof(index_t);
cudaMemcpyAsync(stride_workspace, thrust::raw_pointer_cast(stride_.data()),
stride_.size() * sizeof(index_t),
cudaMemcpyHostToDevice, mshadow::Stream<gpu>::GetStream(s));
cudaMemcpyAsync(trailing_workspace, thrust::raw_pointer_cast(trailing_.data()),
trailing_.size() * sizeof(index_t),
cudaMemcpyHostToDevice, mshadow::Stream<gpu>::GetStream(s));
MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
Kernel<reverse, xpu>::Launch(s, inputs[0].Size(), reverse_index,
inputs[0].dptr<DType>(), outputs[0].dptr<DType>(),
reinterpret_cast<index_t*>(stride_workspace), reinterpret_cast<index_t*>(trailing_workspace));
});
#else
MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
Kernel<reverse, xpu>::Launch(s, inputs[0].Size(), reverse_index,
inputs[0].dptr<DType>(), outputs[0].dptr<DType>(),
stride_.data(), trailing_.data());
});
#endif
}
} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_TENSOR_MATRIX_OP_INL_H_