/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file sparse_retain-inl.h
* \brief Function definition of sparse_retain operator
*/
#ifndef MXNET_OPERATOR_TENSOR_SPARSE_RETAIN_INL_H_
#define MXNET_OPERATOR_TENSOR_SPARSE_RETAIN_INL_H_
#include <mxnet/operator_util.h>
#include <vector>
#include <utility>
#include "./init_op.h"
#include "../mshadow_op.h"
#include "../elemwise_op_common.h"
#include "../mxnet_op.h"
namespace mxnet {
namespace op {
/*!
* \brief sparse retain namespace
*/
namespace sr {
enum SparseRetainOpInputs {kArr, kIdx};
enum SparseRetainOpOutputs {kOut};
} // namespace sr
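/*!
 * Example: for a row-sparse input whose stored rows are {0, 2, 3} and
 * idx = [0, 3], sparse_retain keeps rows 0 and 3 and drops row 2; the
 * output is a row-sparse ndarray whose row indices are exactly idx,
 * with all-zero rows for any index not stored in the input.
 */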
inline bool SparseRetainOpShape(const nnvm::NodeAttrs& attrs,
mxnet::ShapeVector *in_attrs,
mxnet::ShapeVector *out_attrs) {
CHECK_EQ(in_attrs->size(), 2U)
<< "sparse_retain operator takes 2 arguments (" << in_attrs->size() << " given)";
CHECK_EQ(out_attrs->size(), 1U);
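  // The output has the same shape as the data array; assign the shape in both
  // directions so it can be inferred from either side.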
mxnet::TShape tshape((*in_attrs)[sr::kArr]);
shape_assign(&tshape, (*out_attrs)[sr::kOut]);
SHAPE_ASSIGN_CHECK(*in_attrs, sr::kArr, tshape);
SHAPE_ASSIGN_CHECK(*out_attrs, sr::kOut, tshape);
return true;
}
inline bool SparseRetainOpType(const nnvm::NodeAttrs& attrs,
std::vector<int> *in_attrs,
std::vector<int> *out_attrs) {
CHECK_EQ(in_attrs->size(), 2U);
CHECK_EQ(out_attrs->size(), 1U);
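  // The index dtype must be provided up front; the data dtype is propagated
  // between the input array and the output in both directions.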
CHECK_NE((*in_attrs)[sr::kIdx], -1) << "Index type must be set for sparse_retain operator";
TYPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[sr::kArr]);
TYPE_ASSIGN_CHECK(*in_attrs, 0, (*out_attrs)[sr::kOut]);
return (*in_attrs)[0] != -1;
}
inline bool SparseRetainForwardInferStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
DispatchMode* dispatch_mode,
std::vector<int> *in_attrs,
std::vector<int> *out_attrs) {
CHECK_EQ(in_attrs->size(), 2U);
CHECK_EQ(out_attrs->size(), 1U);
bool dispatched = false;
auto &arr_stype = in_attrs->at(sr::kArr);
auto &idx_stype = in_attrs->at(sr::kIdx);
auto &out_stype = out_attrs->at(sr::kOut);
if (!dispatched && arr_stype == kRowSparseStorage && idx_stype == kDefaultStorage) {
// rsp, dns -> rsp
dispatched = storage_type_assign(&out_stype, kRowSparseStorage,
dispatch_mode, DispatchMode::kFComputeEx);
}
return dispatched;
}
inline bool SparseRetainBackwardInferStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
DispatchMode* dispatch_mode,
std::vector<int> *in_attrs,
std::vector<int> *out_attrs) {
CHECK_EQ(in_attrs->size(), 2U);
CHECK_EQ(out_attrs->size(), 2U);
bool dispatched = false;
const auto &ograd_stype = in_attrs->at(sr::kOut);
const auto &idx_stype = in_attrs->at(sr::kIdx);
auto &arr_grad_stype = out_attrs->at(sr::kArr);
auto &idx_grad_stype = out_attrs->at(sr::kIdx);
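  // Backward takes the dense ograd and the dense index array as inputs; the
  // gradient w.r.t. the data array is row-sparse, while the (unused) gradient
  // w.r.t. the indices stays dense.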
if (!dispatched && ograd_stype == kDefaultStorage && idx_stype == kDefaultStorage) {
if (type_assign(&arr_grad_stype, kRowSparseStorage) &&
type_assign(&idx_grad_stype, kDefaultStorage)) {
DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx);
dispatched = true;
}
}
return dispatched;
}
/*!
 * \brief Each thread looks up one user-supplied index in the row
 * indices of the input row-sparse ndarray, so every thread carries
 * almost the same workload; the overhead is one binary search per
 * thread. If all the indices of the idx array are contained
 * in in_idx, one should use SparseRetainRspRowBlockKernel instead,
 * where each thread performs the binary search only once.
 */
struct SparseRetainRspThreadKernel {
template<typename DType, typename RType, typename IType>
MSHADOW_XINLINE static void Map(int i, DType* out_data, RType* out_idx,
const DType* in_data, const RType* in_idx,
const IType* idx, const size_t nnr,
const size_t row_length) {
const RType irow = idx[i];
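    // Binary search for irow among the nnr stored row indices of the input.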
int j = -1, left = 0, right = nnr - 1;
while (left <= right) {
int m = left + (right - left) / 2;
const auto in_idx_m = in_idx[m];
if (in_idx_m == irow) {
j = m;
break;
} else if (in_idx_m < irow) {
left = m + 1;
} else {
right = m - 1;
}
}
out_idx[i] = idx[i];
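    // Copy the matching row if one was found; rows absent from the input stay
    // zero (the caller pre-fills the output data with zeros).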
if (j >= 0) {
const size_t in_offset = j * row_length;
const size_t out_offset = i * row_length;
for (size_t k = 0; k < row_length; ++k) {
out_data[out_offset+k] = in_data[in_offset+k];
}
}
}
};
/*!
 * \brief This kernel should be invoked when the row indices
 * to be retained are all present in the input rsp.
 * Each thread handles a contiguous segment of the user-input
 * idx array. The first index of the segment is located in the
 * input row indices with a binary search (lower bound); the
 * remaining indices are matched by scanning forward from that
 * position. This kernel assumes that idx has been sorted in
 * ascending order.
 */
struct SparseRetainRspRowBlockKernel {
template<typename DType, typename RType, typename IType>
MSHADOW_XINLINE static void Map(int i, DType* out_data, RType* out_idx,
const DType* in_data, const RType* in_idx,
const IType* idx, const size_t num_indices,
const size_t nnr, const size_t row_length,
const size_t seg_len) {
const size_t seg_start = i * seg_len;
if (seg_start >= num_indices) return;
const size_t seg_end = (seg_start+seg_len < num_indices? seg_start+seg_len : num_indices);
for (size_t j = seg_start; j < seg_end; ++j) {
out_idx[j] = idx[j];
}
// use binary search to find the lower bound of idx[seg_start] in in_idx
const RType* first = in_idx;
const RType* last = in_idx + nnr;
const auto val = idx[seg_start];
const RType* it;
int count = last - first, step;
while (count > 0) {
it = first;
step = count / 2;
it += step;
if (*it < val) {
first = ++it;
count -= step + 1;
} else {
count = step;
}
}
size_t cur_row_idx = first - in_idx;
// end of binary search
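    // If none of this segment's indices overlap the input's stored rows,
    // there is nothing to copy.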
if (cur_row_idx == nnr || in_idx[cur_row_idx] > idx[seg_end-1]) {
return;
}
size_t cur_idx = seg_start;
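    // Merge-style two-pointer scan over in_idx and idx: copy rows whose
    // indices match, otherwise advance the pointer with the smaller index.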
while (cur_row_idx < nnr && cur_idx < seg_end) {
if (in_idx[cur_row_idx] == idx[cur_idx]) {
const size_t in_offset = cur_row_idx * row_length;
const size_t out_offset = cur_idx * row_length;
for (size_t k = 0; k < row_length; ++k) {
out_data[out_offset+k] = in_data[in_offset+k];
}
++cur_row_idx;
++cur_idx;
} else if (in_idx[cur_row_idx] < idx[cur_idx]) {
++cur_row_idx;
} else {
++cur_idx;
}
}
}
};
/*!
 * \brief Copy input indices to output indices.
 * Only used when the input rsp is dense.
 */
struct SparseRetainCopyIndices {
template<typename RType, typename IType>
MSHADOW_XINLINE static void Map(int i, RType* out_idx, IType* idx) {
out_idx[i] = idx[i];
}
};
/*!
 * \brief Copy the retained input rows to the output rows.
 * Only used when the input rsp is dense.
 * This kernel is only used when ctx is on GPU, so it is
 * parallelized over the elements of out_rows rather than its rows.
 */
struct SparseRetainCopyRetainedRowsFromDnsPerElem {
template<typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i, DType* out_rows, const DType* in_rows,
const IType* idx, const size_t row_length) {
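    // i indexes one element of out_rows; map it to (row, column) and gather
    // that element from the input row selected by idx[row].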
const size_t irow = i / row_length;
const size_t icol = i % row_length;
out_rows[i] = in_rows[static_cast<size_t>(idx[irow]) * row_length + icol];
}
};
/*!
 * \brief Copy the retained input rows to the output rows.
 * Only used when the input rsp is dense.
 * This kernel is only used when ctx is on CPU, so it is
 * parallelized over the rows of out_rows rather than its elements.
 */
struct SparseRetainCopyRetainedRowsFromDnsPerRow {
template<typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i, DType* out_rows, const DType* in_rows,
const IType* idx, const size_t row_length) {
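    // i indexes one retained row; copy the whole source row idx[i] into output row i.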
const size_t dst_offset = i * row_length;
const size_t src_offset = static_cast<size_t>(idx[i]) * row_length;
for (size_t j = 0; j < row_length; j++) {
out_rows[dst_offset + j] = in_rows[src_offset + j];
}
}
};
template<typename xpu>
void SparseRetainOpForwardRspImpl(mshadow::Stream<xpu> *s,
const NDArray& input_nd,
const TBlob& idx_data,
const OpReqType req,
NDArray* output_nd) {
if (req == kNullOp) return;
CHECK_EQ(req, kWriteTo) << "SparseRetainOpForwardRspImpl only supports req = kWriteTo for now";
CHECK_EQ(input_nd.storage_type(), kRowSparseStorage)
<< "SparseRetainOpForwardRspImpl operator only takes row sparse NDArray as input";
CHECK_EQ(output_nd->storage_type(), kRowSparseStorage)
<< "SparseRetainOpForwardRspImpl operator only outputs row sparse NDArray";
if (!input_nd.storage_initialized() || idx_data.Size() == 0U || input_nd.shape()[0] == 0) {
FillZerosRspImpl(s, *output_nd);
return;
}
const TBlob input_data = input_nd.data();
const TBlob input_idx = input_nd.aux_data(rowsparse::kIdx);
output_nd->CheckAndAlloc({mshadow::Shape1(idx_data.Size())});
TBlob output_data = output_nd->data();
TBlob output_idx = output_nd->aux_data(rowsparse::kIdx);
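  // Number of elements per row: all dimensions after the leading row dimension, flattened.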
const auto row_length = input_data.shape_.ProdShape(1, input_data.shape_.ndim());
using namespace mxnet_op;
MSHADOW_TYPE_SWITCH(output_data.type_flag_, DType, { // output data type
Kernel<set_zero, xpu>::Launch(s, output_data.Size(), output_data.dptr<DType>());
MSHADOW_IDX_TYPE_SWITCH(output_idx.type_flag_, RType, { // row index data type
MSHADOW_TYPE_SWITCH(idx_data.type_flag_, IType, { // index array data type
if (input_idx.Size() == input_nd.shape()[0]) { // input rsp is dense
using namespace mshadow;
// copy indices
Tensor<xpu, 1, RType> output_idx_tensor = output_idx.FlatTo1D<xpu, RType>(s);
const size_t num_rows_retained = output_idx.Size();
if (output_idx.type_flag_ == idx_data.type_flag_) { // same type, use Copy
const Tensor<xpu, 1, RType> idx_tensor = idx_data.FlatTo1D<xpu, RType>(s);
Copy(output_idx_tensor, idx_tensor, s);
} else { // different index types, use Kernel::Launch
Kernel<SparseRetainCopyIndices, xpu>::Launch(s, num_rows_retained,
output_idx.dptr<RType>(), idx_data.dptr<IType>());
}
// copy data
if (std::is_same<xpu, cpu>::value) { // For cpu, parallelize by rows
Kernel<SparseRetainCopyRetainedRowsFromDnsPerRow, xpu>::Launch(s, idx_data.Size(),
output_data.dptr<DType>(), input_data.dptr<DType>(),
idx_data.dptr<IType>(), row_length);
} else { // For gpu, parallelize by elements
Kernel<SparseRetainCopyRetainedRowsFromDnsPerElem, xpu>::Launch(s, output_data.Size(),
output_data.dptr<DType>(), input_data.dptr<DType>(),
idx_data.dptr<IType>(), row_length);
}
} else { // input rsp is not dense
Kernel<SparseRetainRspThreadKernel, xpu>::Launch(s, idx_data.Size(),
output_data.dptr<DType>(), output_idx.dptr<RType>(), input_data.dptr<DType>(),
input_idx.dptr<RType>(), idx_data.dptr<IType>(), input_data.shape_[0], row_length);
}
});
});
});
}
template<typename xpu>
void SparseRetainOpForwardEx(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<NDArray>& inputs,
const std::vector<OpReqType>& req,
const std::vector<NDArray>& outputs) {
CHECK_EQ(inputs.size(), 2U);
CHECK_EQ(outputs.size(), 1U);
CHECK_EQ(req.size(), 1U);
if (req[sr::kOut] == kNullOp) return;
CHECK_EQ(req[sr::kOut], kWriteTo) << "sparse_retain only supports req=\'write\'";
CHECK_EQ(inputs[sr::kIdx].storage_type(), kDefaultStorage)
<< "sparse_retain operator only takes default NDArray as its index array";
if (inputs[sr::kArr].storage_type() == kRowSparseStorage) {
NDArray output_nd = outputs[sr::kOut];
SparseRetainOpForwardRspImpl<xpu>(ctx.get_stream<xpu>(), inputs[sr::kArr],
inputs[sr::kIdx].data(), req[sr::kOut], &output_nd);
} else {
LOG(FATAL) << "sparse_retain op only supports row-sparse ndarrays as input";
}
}
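/*!
 * A minimal sketch of how the forward-path functions above are typically wired
 * up via NNVM op registration in the corresponding .cc file; the exact
 * attribute types and names may differ between MXNet versions:
 *
 *   NNVM_REGISTER_OP(sparse_retain)
 *   .set_num_inputs(2)
 *   .set_num_outputs(1)
 *   .set_attr<mxnet::FInferShape>("FInferShape", SparseRetainOpShape)
 *   .set_attr<nnvm::FInferType>("FInferType", SparseRetainOpType)
 *   .set_attr<FInferStorageType>("FInferStorageType", SparseRetainForwardInferStorageType)
 *   .set_attr<FComputeEx>("FComputeEx<cpu>", SparseRetainOpForwardEx<cpu>);
 */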
template<int req>
struct SparseRetainRspGradKernel {
template<typename DType, typename RType, typename IType>
MSHADOW_XINLINE static void Map(int i, DType* in_grad, RType* in_grad_idx,
const DType* out_grad, const IType* idx,
const size_t row_length) {
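    // Gradient row i corresponds to user index idx[i]: record that row index and
    // gather the matching row from the dense ograd into the row-sparse in_grad.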
const RType irow = idx[i];
in_grad_idx[i] = irow;
const size_t out_offset = irow * row_length;
const size_t in_offset = i * row_length;
for (size_t j = 0; j < row_length; ++j) {
KERNEL_ASSIGN(in_grad[in_offset+j], req, out_grad[out_offset+j]);
}
}
};
template<typename xpu>
void SparseRetainOpBackwardEx(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<NDArray>& inputs,
const std::vector<OpReqType>& req,
const std::vector<NDArray>& outputs) {
CHECK_EQ(req.size(), 2U);
CHECK_EQ(req[sr::kIdx], kNullOp);
if (req[sr::kArr] == kNullOp) return;
CHECK_EQ(req[sr::kArr], kWriteTo);
CHECK_EQ(inputs.size(), 2U);
CHECK_EQ(outputs.size(), 2U)
<< "sparse_retain does not support calculating gradients of indices";
CHECK_EQ(inputs[sr::kOut].storage_type(), kDefaultStorage)
<< "sparse_retain backward only takes default NDArray as ograd";
CHECK_EQ(inputs[sr::kIdx].storage_type(), kDefaultStorage)
<< "sparse_retain backward only takes default NDArray as its index array";
CHECK_EQ(outputs[sr::kArr].storage_type(), kRowSparseStorage)
<< "sparse_retain backward only outputs row sparse NDArray as grad of input";
using namespace mxnet_op;
using namespace mshadow;
Stream<xpu> *s = ctx.get_stream<xpu>();
const TBlob idx_data = inputs[sr::kIdx].data();
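  // With no retained indices, the gradient w.r.t. the data array is all zeros.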
if (idx_data.Size() == 0U) {
FillZerosRspImpl(s, outputs[sr::kArr]);
return;
}
const TBlob out_grad_data = inputs[sr::kOut].data();
NDArray in_grad_nd = outputs[sr::kArr];
in_grad_nd.CheckAndAlloc({mshadow::Shape1(idx_data.Size())});
TBlob in_grad_data = in_grad_nd.data();
TBlob in_grad_idx = in_grad_nd.aux_data(rowsparse::kIdx);
const auto row_length = out_grad_data.shape_.ProdShape(1, out_grad_data.shape_.ndim());
MSHADOW_TYPE_SWITCH(out_grad_data.type_flag_, DType, { // output data type
MSHADOW_IDX_TYPE_SWITCH(in_grad_idx.type_flag_, RType, { // row index data type
MSHADOW_TYPE_SWITCH(idx_data.type_flag_, IType, { // index array data type
MXNET_ASSIGN_REQ_SWITCH(req[sr::kArr], req_type, {
Kernel<SparseRetainRspGradKernel<req_type>, xpu>::Launch(
s, in_grad_idx.Size(), in_grad_data.dptr<DType>(), in_grad_idx.dptr<RType>(),
out_grad_data.dptr<DType>(), idx_data.dptr<IType>(), row_length);
});
});
});
});
}
} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_TENSOR_SPARSE_RETAIN_INL_H_