| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| /*! |
| * \file cast_storage-inl.h |
 * \brief CPU implementation of the cast_storage operator for dense and sparse tensors
| */ |
| #ifndef MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_H_ |
| #define MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_H_ |
| |
| #include <dmlc/timer.h> |
| #include <mxnet/ndarray.h> |
| #include <vector> |
| #include <algorithm> |
| #include "../mxnet_op.h" |
| #include "../operator_common.h" |
| #include "../../src/operator/tensor/init_op.h" |
| #ifdef __CUDACC__ |
| #include "./cast_storage-inl.cuh" |
| #endif // __CUDACC__ |
| #if MXNET_USE_MKLDNN == 1 |
| #include "../nn/mkldnn/mkldnn_base-inl.h" |
| #endif |
| |
| |
| namespace mxnet { |
| namespace op { |
| |
| /*! |
 * \brief CPU kernel for marking row_idx of an RSP tensor: writes 1 for a non-zero row and 0 for a zero row.
| */ |
| struct MarkRspRowIdx { |
| // i represents the row index of the tensor data |
| template<typename DType, typename RType> |
| MSHADOW_CINLINE static void Map(int i, |
| RType* row_idx, |
| const DType* data, |
| const nnvm::dim_t row_length) { |
| using nnvm::dim_t; |
| dim_t j = 0; |
| dim_t offset = i * row_length; |
| for (; j < row_length; ++j) { |
| if (data[offset+j] != 0) { |
| break; |
| } |
| } |
| if (row_length == j) { |
| row_idx[i] = 0; // mark as zero for zero row |
| } else { |
| row_idx[i] = 1; // mark as one for non-zero row |
| } |
| } |
| }; |
| |
| /*! |
| * \brief CPU implementation of casting a dns tensor to rsp type. |
| */ |
| inline void CastStorageDnsRspImpl(const OpContext& ctx, |
| const cpu& cpu_dev, |
| const TBlob& dns, |
| NDArray* rsp) { |
| using namespace rowsparse; |
| using namespace mshadow; |
| using nnvm::dim_t; |
| CHECK(rsp != nullptr); |
| CHECK_EQ(rsp->storage_type(), kRowSparseStorage); |
| CHECK_EQ(dns.shape_, rsp->shape()); |
| mshadow::Stream<cpu>* s = ctx.get_stream<cpu>(); |
| MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type |
| MSHADOW_IDX_TYPE_SWITCH(rsp->aux_type(kIdx), RType, { // row idx type |
| const dim_t num_rows = dns.shape_[0]; |
| const dim_t row_length = dns.shape_.ProdShape(1, dns.shape_.ndim()); |
| rsp->CheckAndAllocAuxData(kIdx, Shape1(num_rows)); |
| TBlob row_idx_blob = rsp->aux_data(kIdx); |
| RType* row_idx = row_idx_blob.dptr<RType>(); |
| dim_t num_threads = num_rows; |
| mxnet_op::Kernel<MarkRspRowIdx, cpu>::Launch(s, num_threads, |
| row_idx, dns.dptr<DType>(), row_length); |
      dim_t nnr = common::ParallelAccumulate(row_idx, num_rows, static_cast<dim_t>(0));
| rsp->set_aux_shape(kIdx, Shape1(nnr)); |
| if (0 == nnr) return; |
| auto storage_shape = dns.shape_; |
| storage_shape[0] = nnr; |
| rsp->CheckAndAllocData(storage_shape); |
| auto dns_data = dns.get_with_shape<cpu, 2, DType>(Shape2(num_rows, row_length), s); |
| auto rsp_data = rsp->data().get_with_shape<cpu, 2, DType>(Shape2(nnr, row_length), s); |
| dim_t idx = 0; |
| for (dim_t i = 0; i < num_rows; ++i) { |
| if (row_idx[i] > 0) { |
| row_idx[idx] = i; |
| Copy(rsp_data[idx], dns_data[i], s); |
| ++idx; |
| } |
| } |
| }); |
| }); |
| } |
| |
// TODO(haibin): using memcpy instead of assigning each element individually would be much faster
| struct CastStorageRspDnsKernel { |
| template<typename DType, typename IType> |
| MSHADOW_XINLINE static void Map(int i, |
| const nnvm::dim_t row_length, |
| const IType* idx, |
| const DType *data, |
| DType* dns) { |
| using nnvm::dim_t; |
| IType rid = idx[i]; |
| dim_t dns_offset = rid * row_length; |
| dim_t rsp_offset = i * row_length; |
| for (dim_t col = 0; col < row_length; col++) { |
| dns[dns_offset + col] = data[rsp_offset + col]; |
| } |
| } |
| }; |
| |
| /*! |
| * \brief This function assumes that the memory for dns has been allocated already |
| * since the shape is known at binding stage. |
| */ |
| template<typename xpu> |
| void CastStorageRspDnsImpl(const OpContext& ctx, |
| const NDArray& rsp, |
| TBlob* dns) { |
| mshadow::Stream<xpu>* s = ctx.get_stream<xpu>(); |
| CHECK_EQ(rsp.storage_type(), kRowSparseStorage); |
| using nnvm::dim_t; |
| MSHADOW_TYPE_SWITCH(dns->type_flag_, DType, { |
| MSHADOW_IDX_TYPE_SWITCH(rsp.aux_type(rowsparse::kIdx), IType, { |
| // assign zeros |
| mxnet_op::Kernel<mxnet_op::set_zero, xpu>::Launch(s, dns->Size(), dns->dptr<DType>()); |
| if (rsp.storage_initialized()) { |
| // copy over row by row |
| auto in_idx = rsp.aux_data(rowsparse::kIdx).FlatTo1D<xpu, IType>(s).dptr_; |
| auto in_data = rsp.data().dptr<DType>(); |
| auto out_data = dns->dptr<DType>(); |
| auto shape = rsp.shape(); |
| const dim_t num_rows = rsp.aux_shape(rowsparse::kIdx).Size(); |
| const dim_t row_length = shape.ProdShape(1, shape.ndim()); |
| const dim_t num_threads = num_rows; |
| mxnet_op::Kernel<CastStorageRspDnsKernel, xpu>::Launch(s, num_threads, |
| row_length, in_idx, in_data, out_data); |
| } |
| }); |
| }); |
| } |
| |
| /*! |
| * \brief CPU kernel for initializing the indptr in a csr matrix. |
| */ |
| struct FillCsrIndPtr { |
| /*! |
 * \brief Counts the non-zero elements in row i of the dns tensor and stores the count in indptr[i+1].
| * \param i the i-th row of the dns tensor |
| * \param indptr the indptr of the csr tensor |
| * \param dns the dns tensor |
| * \param num_rows number of rows of the dns tensor |
| * \param num_cols number of columns of the dns tensor |
| */ |
| template<typename DType, typename IType> |
| MSHADOW_CINLINE static void Map(int i, |
| IType* indptr, |
| const DType* dns, |
| const nnvm::dim_t num_rows, |
| const nnvm::dim_t num_cols) { |
| using nnvm::dim_t; |
| indptr[i+1] = 0; |
| const dim_t offset = i * num_cols; |
| for (dim_t j = 0; j < num_cols; ++j) { |
| if (dns[offset+j] != 0) { |
| ++indptr[i+1]; |
| } |
| } |
| } |
| }; |
| |
| /*! |
| * \brief CPU kernel for initializing the col_idx and value array of the csr matrix. |
| */ |
| struct FillCsrColIdxAndVals { |
| /*! |
 * \brief Fills the value and column-index arrays of the csr matrix for row i, starting at indptr[i].
| * \param i the i-th row of the dns tensor |
| * \param val value array of the csr tensor |
| * \param col_idx column idx array of the csr tensor |
| * \param indptr indptr array of the csr tensor |
| * \param dns dns tensor |
| * \param num_rows number of rows of the dns tensor |
| * \param num_cols number of columns of the dns tensor |
| */ |
| template<typename DType, typename IType, typename CType> |
| MSHADOW_CINLINE static void Map(int i, |
| DType* val, |
| CType* col_idx, |
| const IType* indptr, |
| const DType* dns, |
| const nnvm::dim_t num_rows, |
| const nnvm::dim_t num_cols) { |
| using nnvm::dim_t; |
| const dim_t offset = i * num_cols; |
| IType k = indptr[i]; |
| for (dim_t j = 0; j < num_cols; ++j) { |
| if (dns[offset+j] != 0) { |
| val[k] = dns[offset+j]; |
| col_idx[k] = j; |
| ++k; |
| } |
| } |
| } |
| }; |
| |
| /*! |
| * \brief CPU implementation of casting a dns matrix to csr type. |
| */ |
| inline void CastStorageDnsCsrImpl(const OpContext& ctx, |
| const cpu& cpu_dev, |
| const TBlob& dns, |
| NDArray* csr) { |
| CHECK(csr != nullptr); |
| CHECK_EQ(csr->storage_type(), kCSRStorage); |
| CHECK_EQ(dns.shape_.ndim(), 2); |
| CHECK_EQ(dns.shape_, csr->shape()); |
| using mshadow::Shape1; |
| using nnvm::dim_t; |
| mshadow::Stream<cpu>* s = ctx.get_stream<cpu>(); |
| MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type |
| MSHADOW_IDX_TYPE_SWITCH(csr->aux_type(csr::kIndPtr), IType, { // indptr type |
| MSHADOW_IDX_TYPE_SWITCH(csr->aux_type(csr::kIdx), CType, { // col idx type |
| const dim_t num_rows = dns.shape_[0]; |
| const dim_t num_cols = dns.shape_[1]; |
        csr->CheckAndAllocAuxData(csr::kIndPtr, Shape1(num_rows+1));
| IType* indptr = csr->aux_data(csr::kIndPtr).dptr<IType>(); |
| DType* dns_data = dns.dptr<DType>(); |
| dim_t num_threads = num_rows; |
| mxnet_op::Kernel<FillCsrIndPtr, cpu>::Launch(s, num_threads, |
| indptr, dns_data, num_rows, num_cols); |
| // single thread to accumulate indptr |
| // indptr[num_rows] indicates the number of non-zero elements |
| indptr[0] = 0; |
| for (dim_t i = 0; i < num_rows; ++i) { |
| indptr[i+1] += indptr[i]; |
| } |
| // allocate column idx array and value array |
| csr->CheckAndAllocAuxData(csr::kIdx, Shape1(static_cast<index_t>(indptr[num_rows]))); |
| csr->CheckAndAllocData(Shape1(static_cast<index_t>(indptr[num_rows]))); |
| // fill col_idx and value arrays of the csr |
| mxnet_op::Kernel<FillCsrColIdxAndVals, cpu>::Launch(s, num_threads, |
| csr->data().dptr<DType>(), csr->aux_data(csr::kIdx).dptr<CType>(), |
| indptr, dns_data, num_rows, num_cols); |
| }); |
| }); |
| }); |
| } |
| |
| /*! |
 * \brief Kernel for copying csr.data to its corresponding dns matrix.
| */ |
| struct CopyCsrDataToDns { |
| /*! |
 * \brief Scatters the stored elements of row i of the csr tensor into the dense output.
| * \param i the i-th row of the dns tensor |
| * \param dns_data data blob of the dns tensor |
| * \param col_idx column idx array of the csr tensor |
| * \param indptr indptr array of the csr tensor |
| * \param csr_data data blob of the csr tensor |
| * \param num_cols number of columns of the dns tensor |
| */ |
| template<typename DType, typename IType, typename CType> |
| MSHADOW_XINLINE static void Map(int i, |
| DType* dns_data, |
| const CType* col_idx, |
| const IType* indptr, |
| const DType* csr_data, |
| const nnvm::dim_t num_cols) { |
| const nnvm::dim_t offset = i * num_cols; |
| for (IType j = indptr[i]; j < indptr[i+1]; ++j) { |
| dns_data[offset+col_idx[j]] = csr_data[j]; |
| } |
| } |
| }; |
| |
| /*! |
| * \brief Casts a csr matrix to dns format. |
| */ |
| template<typename xpu> |
| void CastStorageCsrDnsImpl(const OpContext& ctx, |
| const NDArray& csr, |
| TBlob* dns) { |
| CHECK(dns != nullptr); |
| CHECK_EQ(csr.storage_type(), kCSRStorage); |
| CHECK_EQ(dns->shape_.ndim(), 2); |
| CHECK_EQ(dns->shape_, csr.shape()); |
| using nnvm::dim_t; |
| mshadow::Stream<xpu>* s = ctx.get_stream<xpu>(); |
| MSHADOW_TYPE_SWITCH(dns->type_flag_, DType, { // data type |
| MSHADOW_IDX_TYPE_SWITCH(csr.aux_type(csr::kIndPtr), IType, { // indptr type |
| MSHADOW_IDX_TYPE_SWITCH(csr.aux_type(csr::kIdx), CType, { // col idx type |
| const dim_t num_rows = dns->shape_[0]; |
| const dim_t num_cols = dns->shape_[1]; |
| DType* dns_data = dns->dptr<DType>(); |
| dim_t num_threads = dns->shape_.Size(); |
| mxnet_op::Kernel<mxnet_op::set_zero, xpu>::Launch(s, num_threads, dns_data); |
| if (!csr.storage_initialized()) return; |
| const IType* indptr = csr.aux_data(csr::kIndPtr).dptr<IType>(); |
| const CType* col_idx = csr.aux_data(csr::kIdx).dptr<CType>(); |
| const DType* csr_data = csr.data().dptr<DType>(); |
| num_threads = num_rows; |
| mxnet_op::Kernel<CopyCsrDataToDns, xpu>::Launch(s, num_threads, |
| dns_data, col_idx, indptr, csr_data, num_cols); |
| }); |
| }); |
| }); |
| } |
| |
| /*! |
| * \brief Casts a csr matrix to another csr. |
| */ |
| template <typename xpu> |
| void CastStorageCsrCsrImpl(const OpContext& ctx, const NDArray& csr, |
| NDArray* output) { |
| mshadow::Stream<xpu>* s = ctx.get_stream<xpu>(); |
| if (!csr.storage_initialized()) { |
| FillZerosCsrImpl(s, *output); |
| return; |
| } |
| mxnet::ShapeVector aux_shapes({csr.aux_shape(csr::kIndPtr), csr.aux_shape(csr::kIdx)}); |
| output->CheckAndAlloc(aux_shapes); |
| const TBlob& val = output->data(); |
| const TBlob& indptr = output->aux_data(csr::kIndPtr); |
| const TBlob& idx = output->aux_data(csr::kIdx); |
| mxnet_op::copy(s, val, csr.data()); |
| mxnet_op::copy(s, indptr, csr.aux_data(csr::kIndPtr)); |
| mxnet_op::copy(s, idx, csr.aux_data(csr::kIdx)); |
| } |
| |
| /*! |
| * \brief Casts a rsp matrix to another rsp. |
| */ |
| template <typename xpu> |
| void CastStorageRspRspImpl(const OpContext& ctx, const NDArray& rsp, |
| NDArray* output) { |
| CHECK_EQ(rsp.storage_type(), output->storage_type()) |
| << "Copying with different storage type"; |
| mshadow::Stream<xpu>* s = ctx.get_stream<xpu>(); |
| if (!rsp.storage_initialized()) { |
| FillZerosRspImpl(s, *output); |
| return; |
| } |
| auto aux_shape = rsp.aux_shape(rowsparse::kIdx); |
| output->CheckAndAlloc({aux_shape}); |
| const TBlob& val = output->data(); |
| const TBlob& idx = output->aux_data(rowsparse::kIdx); |
| const TBlob& from_val = rsp.data(); |
| const TBlob& from_idx = rsp.aux_data(rowsparse::kIdx); |
| mxnet_op::copy(s, val, from_val); |
| mxnet_op::copy(s, idx, from_idx); |
| } |
| |
| template<typename xpu> |
| void CastStorageComputeImpl(const OpContext& ctx, |
| const NDArray& input, |
| const NDArray& output) { |
| const auto src_stype = input.storage_type(); |
| const auto dst_stype = output.storage_type(); |
| if (src_stype == kRowSparseStorage && dst_stype == kDefaultStorage) { |
| TBlob ret = output.data(); |
| CastStorageRspDnsImpl<xpu>(ctx, input, &ret); |
| } else if (src_stype == kDefaultStorage && dst_stype == kRowSparseStorage) { |
    NDArray ret = output;  // get rid of the const qualifier
| CastStorageDnsRspImpl(ctx, xpu(), input.data(), &ret); |
| } else if (src_stype == kDefaultStorage && dst_stype == kCSRStorage) { |
    NDArray ret = output;  // get rid of the const qualifier
| CastStorageDnsCsrImpl(ctx, xpu(), input.data(), &ret); |
| } else if (src_stype == kCSRStorage && dst_stype == kDefaultStorage) { |
| TBlob ret = output.data(); |
| CastStorageCsrDnsImpl<xpu>(ctx, input, &ret); |
| } else if (src_stype == kCSRStorage && dst_stype == kCSRStorage) { |
| NDArray ret = output; |
| CastStorageCsrCsrImpl<xpu>(ctx, input, &ret); |
| } else if (src_stype == kRowSparseStorage && dst_stype == kRowSparseStorage) { |
| NDArray ret = output; |
| CastStorageRspRspImpl<xpu>(ctx, input, &ret); |
| #if MXNET_USE_MKLDNN == 1 |
| } else if (src_stype == kDefaultStorage && dst_stype == kDefaultStorage) { |
| CHECK_EQ(output.ctx().dev_type, input.ctx().dev_type); |
| // If one of them uses the MKLDNN layout. |
| if (input.IsMKLDNNData() || output.IsMKLDNNData()) { |
| NDArray tmp_input = input; |
| // If the input data is MKLDNN and is a view, we need to reorder the input |
| // data first. |
| if (input.IsMKLDNNData() && input.IsView()) |
| tmp_input = input.Reorder2Default(); |
| const mkldnn::memory *in_mem = tmp_input.GetMKLDNNData(); |
| const_cast<NDArray &>(output).CopyFrom(*in_mem); |
| MKLDNNStream::Get()->Submit(); |
| } else { |
| mxnet_op::copy(ctx.get_stream<xpu>(), output.data(), input.data()); |
| } |
| #endif |
| } else { |
| LOG(FATAL) << "Not implemented from " << src_stype << " to " << dst_stype; |
| } |
| } |
| |
| struct CastStorageParam : public dmlc::Parameter<CastStorageParam> { |
| int stype; |
| DMLC_DECLARE_PARAMETER(CastStorageParam) { |
| DMLC_DECLARE_FIELD(stype) |
| .add_enum("default", kDefaultStorage) |
| .add_enum("row_sparse", kRowSparseStorage) |
| .add_enum("csr", kCSRStorage) |
| .describe("Output storage type."); |
| } |
| }; |
| |
| inline bool CastStorageInferStorageType(const nnvm::NodeAttrs& attrs, |
| const int dev_mask, |
| DispatchMode* dispatch_mode, |
| std::vector<int> *in_attrs, |
| std::vector<int> *out_attrs) { |
| CHECK_EQ(in_attrs->size(), 1U); |
| CHECK_EQ(out_attrs->size(), 1U); |
| CHECK_NE(in_attrs->at(0), kUndefinedStorage) |
| << "src ndarray's storage type must be specified"; |
| const CastStorageParam& param = nnvm::get<CastStorageParam>(attrs.parsed); |
| CHECK_NE(param.stype, kUndefinedStorage) |
| << "dst ndarray's storage type must be specified"; |
| const auto& in_stype = in_attrs->at(0); |
| const auto& param_stype = static_cast<NDArrayStorageType>(param.stype); |
| bool dispatched = false; |
| // dns -> dns, dns -> rsp, dns -> csr |
| if (!dispatched && in_stype == kDefaultStorage && param_stype == kDefaultStorage) { |
| // dns -> dns |
| DispatchMode mode = DispatchMode::kFCompute; |
| #if MXNET_USE_MKLDNN == 1 |
    // If MKLDNN is enabled and the arrays are in CPU memory, the array may be
    // stored in an MKLDNN layout, so its layout must be converted explicitly.
| if (dev_mask == kCPU) |
| mode = DispatchMode::kFComputeEx; |
| #endif |
| dispatched = storage_type_assign(out_attrs, kDefaultStorage, dispatch_mode, mode); |
| } |
| if (!dispatched && in_stype == kDefaultStorage && |
| (param_stype == kRowSparseStorage || param_stype == kCSRStorage)) { |
| // dns -> rsp, dns -> csr |
| dispatched = storage_type_assign(out_attrs, param_stype, |
| dispatch_mode, DispatchMode::kFComputeEx); |
| } |
| if (!dispatched && in_stype == kRowSparseStorage && |
| (param_stype == kRowSparseStorage || param_stype == kDefaultStorage)) { |
| // rsp -> rsp, rsp -> dns |
| dispatched = storage_type_assign(out_attrs, param_stype, |
| dispatch_mode, DispatchMode::kFComputeEx); |
| } |
| if (!dispatched && in_stype == kCSRStorage && |
| (param_stype == kCSRStorage || param_stype == kDefaultStorage)) { |
| // csr -> csr, csr -> dns |
| dispatched = storage_type_assign(out_attrs, param_stype, |
| dispatch_mode, DispatchMode::kFComputeEx); |
| } |
| return dispatched; |
| } |
| |
| template<typename xpu> |
| void CastStorageComputeEx(const nnvm::NodeAttrs& attrs, |
| const OpContext& ctx, |
| const std::vector<NDArray>& inputs, |
| const std::vector<OpReqType>& req, |
| const std::vector<NDArray>& outputs) { |
| CHECK_EQ(inputs.size(), 1); |
| CHECK_EQ(outputs.size(), 1); |
| if (req[0] == kNullOp) return; |
| CHECK_EQ(req[0], kWriteTo) << "CastStorageComputeEx expects req[0] == kWriteTo"; |
| CastStorageComputeImpl<xpu>(ctx, inputs[0], outputs[0]); |
| } |
| |
| } // namespace op |
| } // namespace mxnet |
| |
| #endif // MXNET_OPERATOR_TENSOR_CAST_STORAGE_INL_H_ |