/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* Copyright (c) 2017 by Contributors
* \file dot-inl.cuh
* \brief GPU specific function definition of matrix dot operator
*/
#ifndef MXNET_OPERATOR_TENSOR_DOT_INL_CUH_
#define MXNET_OPERATOR_TENSOR_DOT_INL_CUH_
#include <mxnet/base.h>
#include <mxnet/operator.h>
#include "./init_op.h"
#include "./sort_op.h"
#include "./util/tensor_util-inl.h"
#include "./util/tensor_util-inl.cuh"
#include "../../common/utils.h"
namespace mxnet {
namespace op {
/*!
* \brief GPU scalar kernel of dot(csr, dns1) = dns2
* Parallelization by output matrix elements: 1 thread/element
*/
template<int req>
struct DotCsrDnsDnsScalarKernel {
/*!
* \brief Compute the inner product of a row of lhs and a column of rhs and assign
* the result to out[tid].
* \param tid global thread id
* \param out output matrix data
* \param data_l csr matrix data
* \param indptr_l csr matrix row index pointer
* \param col_idx_l csr matrix column indices
* \param data_r dns1 matrix data
* \param num_cols_r dns1 matrix number of columns
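* Example (illustrative, not part of the kernel contract): for a csr lhs = [[1 0 2], [0 3 0]]
* stored as data_l = {1, 2, 3}, indptr_l = {0, 2, 3}, col_idx_l = {0, 2, 1}, and a dense
* rhs = [[1 1], [2 2], [3 3]] (num_cols_r = 2), thread tid = 0 computes
* out[0] = 1*1 + 2*3 = 7 and thread tid = 3 computes out[3] = 3*2 = 6.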
*/
template<typename DType, typename IType, typename CType>
__device__ __forceinline__ static void Map(int tid,
DType* out,
const DType* data_l,
const IType* indptr_l,
const CType* col_idx_l,
const DType* data_r,
const nnvm::dim_t num_cols_r) {
const nnvm::dim_t irow = tid / num_cols_r; // row id of the lhs
const nnvm::dim_t icol = tid % num_cols_r; // col id of the rhs
DType sum = 0;
for (IType j = indptr_l[irow]; j < indptr_l[irow+1]; ++j) {
const CType cur_col = col_idx_l[j]; // corresponding row id of the rhs
sum += data_l[j] * data_r[cur_col*num_cols_r+icol];
}
KERNEL_ASSIGN(out[tid], req, sum);
}
};
/*!
* \brief GPU vector kernel of dot(csr, dns1) = dns2
* Parallelization by output matrix elements: 1 warp/element
*/
template<int req>
struct DotCsrDnsDnsVectorKernel {
/*!
* \brief see DotCsrDnsDnsScalarKernel Map for documentation.
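* Each warp owns one output element: its 32 lanes accumulate partial sums over the
* non-zeros of the csr row and combine them with a shared-memory tree reduction
* (strides 16, 8, 4, 2, 1, with __syncwarp() between steps); lane 0 writes the result.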
*/
template<typename DType, typename IType, typename CType>
__device__ __forceinline__ static void Map(int tid,
DType* out,
const DType* data_l,
const IType* indptr_l,
const CType* col_idx_l,
const DType* data_r,
const nnvm::dim_t num_cols_r) {
using nnvm::dim_t;
__shared__ volatile DType vals[mshadow::cuda::kBaseThreadNum];
const dim_t warp_id = tid / 32; // global warp id
const dim_t lane = tid & (32-1); // local thread id within warp
const dim_t irow = warp_id / num_cols_r; // lhs row that this warp computes
const dim_t kcol = warp_id % num_cols_r; // rhs column that this warp computes
// Range of nnz elements in this row
const dim_t low = static_cast<dim_t>(indptr_l[irow]);
const dim_t high = static_cast<dim_t>(indptr_l[irow+1]);
// Compute running sum per thread
DType sum = 0;
for (dim_t j = low+lane; j < high; j+=32) {
sum += data_l[j] * data_r[col_idx_l[j]*num_cols_r + kcol];
}
vals[threadIdx.x] = sum; __syncwarp();
// Parallel reduction in shared memory
if (lane < 16) {vals[threadIdx.x] += vals[threadIdx.x+16];} __syncwarp();
if (lane < 8) {vals[threadIdx.x] += vals[threadIdx.x+ 8];} __syncwarp();
if (lane < 4) {vals[threadIdx.x] += vals[threadIdx.x+ 4];} __syncwarp();
if (lane < 2) {vals[threadIdx.x] += vals[threadIdx.x+ 2];} __syncwarp();
if (lane < 1) {vals[threadIdx.x] += vals[threadIdx.x+ 1];} __syncwarp();
if (lane == 0) {
KERNEL_ASSIGN(out[irow*num_cols_r+kcol], req, vals[threadIdx.x]);
}
}
};
/*!
* \brief GPU scalar kernel of dot(csr.T, dns1) = dns2
* Parallelization by output matrix elements: 1 thread/element
*/
template<int req>
struct DotCsrTransDnsDnsScalarKernel {
/*!
* \brief Compute the inner product of a column of lhs and a column of rhs and assign
* the result to out[tid].
* \param tid global thread id
* \param out output matrix
* \param data_l csr matrix data
* \param indptr_l csr matrix row index pointer
* \param col_idx_l csr matrix column indices
* \param data_r dns1 matrix data of rhs
* \param num_rows_l csr matrix number of rows (= number of columns of csr.T)
* \param num_cols_r dns1 matrix number of columns
*/
template<typename DType, typename IType, typename CType>
__device__ __forceinline__ static void Map(int tid,
DType* out,
const DType* data_l,
const IType* indptr_l,
const CType* col_idx_l,
const DType* data_r,
const nnvm::dim_t num_rows_l,
const nnvm::dim_t num_cols_r) {
using nnvm::dim_t;
const dim_t irow = tid / num_cols_r; // col id of the lhs
const dim_t icol = tid % num_cols_r; // col id of the rhs
DType sum = 0;
// Each thread scans every csr row, binary-searching its column indices for an element in column irow
for (dim_t k = 0; k < num_rows_l; ++k) {
const dim_t low = static_cast<dim_t>(indptr_l[k]);
const dim_t high = static_cast<dim_t>(indptr_l[k+1]);
if (low == high || irow < col_idx_l[low] || irow > col_idx_l[high-1]) continue;
dim_t j = high, l = low, r = high - 1;
while (l <= r) {
dim_t m = l + (r - l) / 2;
if (col_idx_l[m] == irow) {
j = m; break;
}
if (col_idx_l[m] < irow) {
l = m + 1;
} else {
r = m - 1;
}
}
if (j < high) {
sum += data_l[j] * data_r[k*num_cols_r+icol];
}
}
KERNEL_ASSIGN(out[tid], req, sum);
}
};
/*!
* \brief GPU Kernel of dot(csr.T, rsp1) = rsp2
* Parallelization by rows: 1 thread/row
* TODO: write a faster kernel optimized for GPU
*/
struct DotCsrTransRspRspByRowsKernel {
/*!
* \brief Compute non-zero row 'tid' of the output rsp matrix
* \param tid global thread id
* \param out output rsp matrix data
* \param row_idx_out output rsp matrix non-zero row indices
* \param data_l csr matrix data
* \param indptr_l csr matrix row index pointer
* \param col_idx_l csr matrix column indices
* \param data_r rsp1 matrix data
* \param row_idx_r rsp1 matrix non-zero row indices
* \param num_cols_r rsp1 matrix number of cols
* \param nnr_r rsp1 matrix number of non-zero rows
* \param nnr_out output rsp matrix number of non-zero rows
*/
template<typename DType, typename IType, typename CType, typename RType>
__device__ __forceinline__ static void Map(int tid,
DType* out,
const RType* row_idx_out,
const DType* data_l,
const IType* indptr_l,
const CType* col_idx_l,
const DType* data_r,
const RType* row_idx_r,
const nnvm::dim_t num_cols_r,
const nnvm::dim_t nnr_r,
const nnvm::dim_t nnr_out) {
using nnvm::dim_t;
// This thread computes non-zero row 'tid' of the output matrix
// The actual row id corresponding to the lhs row is row_idx_out[tid]
if (tid < nnr_out) {
const dim_t offset_out = tid * num_cols_r;
// Iterate over rhs matrix rows (or, equivalently, the lhs columns worth considering)
for (dim_t i = 0; i < nnr_r; i++) {
const RType j = row_idx_r[i]; // j is the actual rhs row id (= lhs column id)
if (indptr_l[j] == indptr_l[j+1]) continue;
const dim_t offset_r = i * num_cols_r;
// Iterate over lhs column j to find possible non-zero value in this row
// TODO: remove sequential search, this is a bottleneck
for (IType k = indptr_l[j]; k < indptr_l[j+1]; k++) {
const CType col_idx = col_idx_l[k];
if (col_idx == row_idx_out[tid]) {
for (dim_t l = 0; l < num_cols_r; l++) {
out[offset_out+l] += data_l[k] * data_r[offset_r+l];
}
} else if (col_idx > row_idx_out[tid]) {
break;
}
}
}
}
}
};
/*!
* \brief GPU Kernel of dot(csr, rsp) = dns
* Parallelization by output elements: 1 thread/element
*/
struct DotCsrRspDnsScalarKernel {
/*!
* \brief Compute one element of the output dns matrix
* \param tid global thread id
* \param out output dns matrix data
* \param data_l csr matrix data
* \param indptr_l csr matrix row index pointer
* \param col_idx_l csr matrix column indices
* \param data_r rsp matrix data
* \param row_idx_r rsp matrix non-zero row indices
* \param row_flg_r rsp matrix auxiliary array holding storage indices of non-zero rows
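* (1-based: row_flg_r[i] == 0 means rhs row i is not stored, otherwise
* row_flg_r[i] - 1 is its position in data_r)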
* \param nnr_r rsp matrix number of non-zero rows
* \param num_rows output dns matrix number of rows
* \param num_cols output dns matrix number of columns
*/
template<typename DType, typename IType, typename CType, typename RType>
__device__ __forceinline__ static void Map(int tid,
DType* out,
const DType* data_l,
const IType* indptr_l,
const CType* col_idx_l,
const DType* data_r,
const RType* row_idx_r,
const RType* row_flg_r,
const nnvm::dim_t nnr_r,
const nnvm::dim_t num_rows,
const nnvm::dim_t num_cols) {
using nnvm::dim_t;
if (tid < num_rows*num_cols) {
const dim_t i = static_cast<dim_t>(tid) / num_cols; // i = row this thread computes
const dim_t k = static_cast<dim_t>(tid) % num_cols; // k = col this thread computes
// Compute inner product of i-th row and k-th col
DType sum = 0;
for (IType j = indptr_l[i]; j < indptr_l[i+1]; j++) {
const dim_t csr_col = col_idx_l[j];
const dim_t rsp_row_idx = row_flg_r[csr_col];
if (rsp_row_idx > 0) {
sum += data_l[j] * data_r[(rsp_row_idx-1)*num_cols+k];
}
}
if (sum != 0) {
out[i*num_cols+k] += sum;
}
}
}
};
/*!
* \brief GPU Kernel to scatter row id to corresponding entries
* \param tid global thread id
* \param csr_indptr indptr array of csr
* \param csr_rows array of row id of csr elements
* \param num_rows total number of rows in csr matrix
* Parallelization by csr rows: 1 thread/row
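* Example (illustrative): csr_indptr = {0, 2, 3, 3, 5} with num_rows = 4 yields
* csr_rows = {0, 0, 1, 3, 3}, i.e. each non-zero element is tagged with its row id.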
*/
struct CsrRowScatterKernel {
template<typename CType>
__device__ __forceinline__ static void Map(int tid,
const CType* csr_indptr,
CType* csr_rows,
const nnvm::dim_t num_rows) {
if (tid < num_rows) {
for (CType i = csr_indptr[tid]; i < csr_indptr[tid+1]; ++i) {
csr_rows[i] = tid;
}
}
}
};
struct CscDataIndicesKernel {
/*!
* \brief Gather the data and row indices of the csc matrix from the sorted csr entries
* \param tid global thread id
* \param original_idx_ptr original positions of the sorted csr column indices
* \param csr_data_ptr csr matrix data
* \param csr_rows_ptr row ids of the csr matrix elements
* \param csc_data_ptr output csc matrix data
* \param csc_indices_ptr output csc matrix row indices
* \param nnz number of non-zero elements in the csr matrix
*/
template<typename DType, typename IType, typename CType>
__device__ __forceinline__ static void Map(int tid,
const IType* original_idx_ptr,
const DType* csr_data_ptr,
const CType* csr_rows_ptr,
DType* csc_data_ptr,
IType* csc_indices_ptr,
const nnvm::dim_t nnz) {
using nnvm::dim_t;
if (tid < nnz) {
const IType origin = original_idx_ptr[tid];
csc_data_ptr[tid] = csr_data_ptr[origin];
csc_indices_ptr[tid] = csr_rows_ptr[origin];
}
}
};
/*!
* \brief GPU Kernel of dot(dns, csr.T) = dns
* Parallelization by output elements: 1 thread/element
*/
struct DotDnsCsrTransDnsKernel {
/*!
* \brief Compute one element of the output dns matrix
* \param tid global thread id
* \param lhs_data lhs dense matrix data
* \param rhs_data csr matrix data
* \param rhs_indices csr matrix column indices
* \param rhs_indptr csr matrix row pointer
* \param out output matrix data
* \param lhs_num_cols lhs dns matrix number of columns
* \param out_num_rows output dns matrix number of rows
* \param out_num_cols output dns matrix number of columns
*/
template<typename DType, typename IType, typename CType>
__device__ __forceinline__ static void Map(int tid,
const DType* lhs_data,
const DType* rhs_data,
const IType* rhs_indices,
const CType* rhs_indptr,
DType* out,
const nnvm::dim_t lhs_num_cols,
const nnvm::dim_t out_num_rows,
const nnvm::dim_t out_num_cols) {
using nnvm::dim_t;
if (tid < out_num_rows*out_num_cols) {
const dim_t i = static_cast<dim_t>(tid) % out_num_rows; // i = row this thread computes
const dim_t k = static_cast<dim_t>(tid) / out_num_rows; // k = col this thread computes
// Compute inner product of i-th row and k-th col
DType sum = 0;
for (CType col_id = rhs_indptr[k]; col_id < rhs_indptr[k + 1]; ++col_id) {
sum += lhs_data[i * lhs_num_cols + rhs_indices[col_id]] * rhs_data[col_id];
}
out[i * out_num_cols + k] = sum;
}
}
};
/*!
* \brief GPU Impl of dot(csr, dns1) = dns2 and dot(csr.T, dns1) = dns2
*/
inline void DotCsrDnsDnsImpl(const OpContext& ctx,
const gpu& gpu_dev,
const NDArray& lhs,
const TBlob& rhs,
const OpReqType req,
const bool trans_lhs,
TBlob* ret) {
if (kNullOp == req) return;
CHECK_EQ(lhs.storage_type(), kCSRStorage);
mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
if (!lhs.storage_initialized()) {
Fill(s, *ret, req, 0);
return;
}
using namespace mshadow;
using mxnet_op::Kernel;
using mxnet_op::set_zero;
using nnvm::dim_t;
const dim_t num_rows_l = lhs.shape()[0];
const dim_t num_cols_r = rhs.shape_[1];
const dim_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize;
dim_t num_threads;
// TODO: remove kernel dependency on warpSize=32
if (threads_per_warp != 32) {
LOG(FATAL) << "DotCsrDnsDnsImpl GPU kernels expect warpSize=32";
}
const TBlob data_l = lhs.data();
const TBlob indptr_l = lhs.aux_data(csr::kIndPtr);
const TBlob col_idx_l = lhs.aux_data(csr::kIdx);
const TBlob& data_r = rhs;
const TBlob data_out = *ret;
MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type
MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type
MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type
if (kWriteTo == req) {
num_threads = data_out.Size();
Kernel<set_zero, gpu>::Launch(s, num_threads, data_out.dptr<DType>());
}
if (trans_lhs) {
// TODO(haojin2): Switching to deterministic algorithm for now.
// Further optimizations to come later.
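// The lhs csr matrix is converted to csc format in the workspace below so that
// dot(csr.T, dns) can reuse the dns-output kernels:
// 1) sort the csr column indices, carrying each element's original position,
// 2) scatter the csr indptr into per-element row ids,
// 3) histogram the sorted column indices and exclusive-scan the counts into csc_indptr,
// 4) gather data and row ids into csc_data/csc_indices via the sorted original positions.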
const nnvm::dim_t num_csr_rows = lhs.shape()[0];
const nnvm::dim_t num_csr_cols = lhs.shape()[1];
const nnvm::dim_t num_dns_rows = rhs.shape_[0];
const nnvm::dim_t nnz = lhs.storage_shape().Size();
IType* original_idx_ptr = nullptr;
IType* csc_indices_ptr = nullptr;
IType* csc_cols_ptr = nullptr;
CType* csr_rows_ptr = nullptr;
CType* csc_indptr_ptr = nullptr;
DType* csc_data_ptr = nullptr;
char* temp_storage_ptr = nullptr;
size_t original_idx_bytes = nnz*sizeof(IType);
size_t csc_indices_bytes = nnz*sizeof(IType);
size_t csc_cols_bytes = nnz*sizeof(IType);
size_t csr_rows_bytes = nnz*sizeof(CType);
size_t csc_indptr_bytes = (num_csr_cols+1)*sizeof(CType);
size_t csc_data_bytes = nnz*sizeof(DType);
size_t scan_temp_storage_bytes = 0;
size_t temp_storage_bytes = SortByKeyWorkspaceSize<IType, IType, gpu>(nnz);
IType* csr_indices_ptr = col_idx_l.dptr<IType>();
cub::DeviceScan::ExclusiveSum(temp_storage_ptr,
scan_temp_storage_bytes,
csc_indptr_ptr,
csc_indptr_ptr,
num_csr_cols+1,
mshadow::Stream<gpu>::GetStream(s));
temp_storage_bytes = std::max(temp_storage_bytes, scan_temp_storage_bytes);
temp_storage_bytes += (sizeof(dim_t) - temp_storage_bytes % sizeof(dim_t));
size_t total_workspace_bytes =
original_idx_bytes + csc_indices_bytes + csc_cols_bytes + csr_rows_bytes +
csc_indptr_bytes + csc_data_bytes + temp_storage_bytes;
total_workspace_bytes += (sizeof(IType) - total_workspace_bytes % sizeof(IType));
Tensor<gpu, 1, char> workspace = ctx.requested[0]
.get_space_typed<gpu, 1, char>(Shape1(total_workspace_bytes), s);
original_idx_ptr = reinterpret_cast<IType*>(workspace.dptr_);
csc_indices_ptr = reinterpret_cast<IType*>(workspace.dptr_ + original_idx_bytes);
csc_cols_ptr = reinterpret_cast<IType*>(workspace.dptr_ + original_idx_bytes +
csc_indices_bytes);
csr_rows_ptr = reinterpret_cast<CType*>(workspace.dptr_ + original_idx_bytes +
csc_indices_bytes + csc_cols_bytes);
csc_indptr_ptr = reinterpret_cast<CType*>(workspace.dptr_ + original_idx_bytes +
csc_indices_bytes + csc_cols_bytes +
csr_rows_bytes);
temp_storage_ptr = workspace.dptr_ + original_idx_bytes + csc_indices_bytes +
csc_cols_bytes + csr_rows_bytes + csc_indptr_bytes;
csc_data_ptr = reinterpret_cast<DType*>(
workspace.dptr_ + total_workspace_bytes - csc_data_bytes);
// Fill original_idx
mxnet_op::Kernel<range_fwd, gpu>::Launch(
s, nnz, 1, IType(0), IType(1), kWriteTo, original_idx_ptr);
// Fill csc_cols with copy of csr_indices
mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::identity, kWriteTo>, gpu>::Launch(
s, nnz, csc_cols_ptr, csr_indices_ptr);
// Allocate the tensors needed for SortByKey
Tensor<gpu, 1, IType> original_idx(original_idx_ptr, Shape1(nnz), s);
Tensor<gpu, 1, IType> csc_cols(csc_cols_ptr, Shape1(nnz), s);
Tensor<gpu, 1, char> temp_storage(temp_storage_ptr, Shape1(temp_storage_bytes), s);
int num_bits = common::ilog2ul(num_csr_cols - 1);
SortByKey(csc_cols, original_idx, true, &temp_storage, 0, num_bits);
// Scatter csr indptr to row id
mxnet_op::Kernel<CsrRowScatterKernel, gpu>::Launch(
s, num_csr_rows, indptr_l.dptr<CType>(), csr_rows_ptr, num_csr_rows);
// Reset indptr to zero
mxnet_op::Kernel<mxnet_op::set_zero, gpu>::Launch(s, num_csr_cols+1, csc_indptr_ptr);
// Histogram on the sorted cols
mxnet_op::Kernel<HistogramKernel, gpu>::Launch(
s, nnz, csc_indptr_ptr, csc_cols_ptr, nnz);
// Scan the bin counts for every column to get csc_indptr
cub::DeviceScan::ExclusiveSum(temp_storage_ptr,
temp_storage_bytes,
csc_indptr_ptr,
csc_indptr_ptr,
num_csr_cols+1,
mshadow::Stream<gpu>::GetStream(s));
// Assign data to csc matrix arrays
mxnet_op::Kernel<CscDataIndicesKernel, gpu>::Launch(
s, nnz, original_idx_ptr, data_l.dptr<DType>(), csr_rows_ptr, csc_data_ptr,
csc_indices_ptr, nnz);
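// Assumed rationale for the num_cols_r threshold below: the vector kernel dedicates a
// full warp to each output element, which helps when num_cols_r is small and one thread
// per element would leave the GPU under-occupied; with more columns, the scalar kernel
// already exposes enough parallelism.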
if (num_cols_r > 4) {
num_threads = data_out.Size();
MXNET_ASSIGN_REQ_SWITCH(req, ReqType, {
Kernel<DotCsrDnsDnsScalarKernel<ReqType>, gpu>::Launch(s, num_threads,
data_out.dptr<DType>(), csc_data_ptr, csc_indptr_ptr,
csc_indices_ptr, data_r.dptr<DType>(), num_cols_r);
});
} else {
num_threads = threads_per_warp * num_rows_l * num_cols_r;
MXNET_ASSIGN_REQ_SWITCH(req, ReqType, {
Kernel<DotCsrDnsDnsVectorKernel<ReqType>, gpu>::Launch(s, num_threads,
data_out.dptr<DType>(), csc_data_ptr, csc_indptr_ptr,
csc_indices_ptr, data_r.dptr<DType>(), num_cols_r);
});
}
} else {
if (num_cols_r > 4) {
num_threads = data_out.Size();
MXNET_ASSIGN_REQ_SWITCH(req, ReqType, {
Kernel<DotCsrDnsDnsScalarKernel<ReqType>, gpu>::Launch(s, num_threads,
data_out.dptr<DType>(), data_l.dptr<DType>(), indptr_l.dptr<IType>(),
col_idx_l.dptr<CType>(), data_r.dptr<DType>(), num_cols_r);
});
} else {
num_threads = threads_per_warp * num_rows_l * num_cols_r;
MXNET_ASSIGN_REQ_SWITCH(req, ReqType, {
Kernel<DotCsrDnsDnsVectorKernel<ReqType>, gpu>::Launch(s, num_threads,
data_out.dptr<DType>(), data_l.dptr<DType>(), indptr_l.dptr<IType>(),
col_idx_l.dptr<CType>(), data_r.dptr<DType>(), num_cols_r);
});
}
}
});
});
});
}
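/*!
* \brief GPU Kernel of dot(csr.T, dns) = rsp
* Parallelization by (non-zero element, rhs column) pairs: 1 thread/pair; for each rhs
* column, only the thread at the first element of a run of equal sorted csr column
* indices accumulates the corresponding output element.
*/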
struct DotCsrTransDnsRspKernel {
/*!
* \brief Compute one element of the output rsp matrix
* \param thread_id global thread id
* \param out output rsp matrix data
* \param lookup_table lookup table from lhs column id (= row of csr.T) to row in the output
* \param sorted_indices csr matrix column indices in sorted order
* \param nnz number of non-zeros in csr matrix
* \param original_idx original indices to the unsorted csr column indices
* \param rhs dns rhs data
* \param val_array csr matrix data
* \param idx_array csr matrix row indices
* \param row_length length of a row in the output rsp matrix
*/
template<typename DType, typename IType>
__device__ __forceinline__ static void Map(int thread_id,
DType* out,
const IType* lookup_table,
const IType* sorted_indices,
const nnvm::dim_t nnz,
const IType* original_idx,
const DType* rhs,
const DType* val_array,
const IType* idx_array,
const nnvm::dim_t row_length) {
int tid = thread_id / row_length;
const nnvm::dim_t offset = thread_id % row_length;
if (tid == 0 || sorted_indices[tid - 1] != sorted_indices[tid]) {
DType acc = 0;
const IType src_row_idx = sorted_indices[tid];
const IType dst_row_idx = lookup_table[src_row_idx];
const IType out_offset = dst_row_idx * row_length + offset;
do {
const IType idx = original_idx[tid];
const DType val = val_array[idx];
const IType col_idx = idx_array[idx];
const IType rhs_offset = col_idx * row_length + offset;
acc += rhs[rhs_offset] * val;
tid++;
} while (tid < nnz && sorted_indices[tid - 1] == sorted_indices[tid]);
out[out_offset] = acc;
}
}
};
/*!
* \brief GPU Impl of dot(csr.T, dns) = rsp
*/
inline void DotCsrDnsRspImpl(const OpContext& ctx,
const gpu& gpu_dev,
const NDArray& lhs,
const TBlob& rhs,
const OpReqType req,
const bool trans_lhs,
NDArray* ret) {
if (kNullOp == req) return;
CHECK_EQ(lhs.storage_type(), kCSRStorage);
CHECK_EQ(ret->storage_type(), kRowSparseStorage);
CHECK_EQ(req, kWriteTo);
mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
if (!lhs.storage_initialized()) {
FillZerosRspImpl(s, *ret);
return;
}
using mshadow::Shape1;
using mshadow::Tensor;
using mxnet_op::Kernel;
using mxnet_op::set_zero;
using nnvm::dim_t;
using namespace csr;
const TBlob data_l = lhs.data();
const TBlob indptr_l = lhs.aux_data(kIndPtr);
const TBlob col_idx_l = lhs.aux_data(kIdx);
const TBlob& data_r = rhs;
size_t nnz = lhs.aux_data(kIdx).Size();
const dim_t num_rows_l = lhs.shape()[0];
const dim_t num_cols_l = lhs.shape()[1];
const dim_t num_cols_r = rhs.shape_[1];
CHECK_EQ(ret->aux_type(rowsparse::kIdx), col_idx_l.type_flag_)
<< "Mismatched indices dtype detected";
MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type
MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type
MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type
if (trans_lhs) {
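// Outline of the algorithm below: sort the csr column indices (keeping their original
// positions), take the unique sorted indices as the non-zero row ids of the output rsp,
// build a lookup table from lhs column id to output row id, scatter the csr indptr to
// per-element row ids, then accumulate each run of equal column indices into its output
// row with DotCsrTransDnsRspKernel.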
IType* col_idx_l_ptr = col_idx_l.dptr<IType>();
// temporary memory layout
size_t* nnr_ptr = nullptr;
IType* original_idx_ptr = nullptr;
IType* row_idx_ptr = nullptr;
IType* col_idx_copy_ptr = nullptr;
IType* lookup_table_ptr = nullptr;
char* temp_storage_ptr = nullptr;
// estimate temp space for unique.
const size_t nnr_bytes = sizeof(size_t);
size_t unique_temp_bytes = 0;
size_t *null_ptr = nullptr;
size_t *null_dptr = nullptr;
cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s);
cub::DeviceSelect::Unique(NULL, unique_temp_bytes, null_dptr, null_dptr,
null_ptr, nnz, stream);
// the temp storage for sort and unique
size_t original_idx_bytes = nnz * sizeof(IType);
size_t row_idx_bytes = nnz * sizeof(IType);
size_t col_idx_copy_bytes = nnz * sizeof(IType);
size_t lookup_table_bytes = num_cols_l * sizeof(IType);
size_t sort_temp_bytes = SortByKeyWorkspaceSize<IType, IType, gpu>(nnz);
size_t total_temp_bytes = std::max(sort_temp_bytes, unique_temp_bytes);
// layout: original_idx, col_idx_copy, temp_storage
size_t total_workspace_bytes = nnr_bytes + original_idx_bytes + row_idx_bytes +
col_idx_copy_bytes +
lookup_table_bytes + total_temp_bytes;
// request temp space
Tensor<gpu, 1, char> workspace = ctx.requested[0]
.get_space_typed<gpu, 1, char>(Shape1(total_workspace_bytes), s);
// update individual temp space ptrs
nnr_ptr = reinterpret_cast<size_t*>(workspace.dptr_);
original_idx_ptr = reinterpret_cast<IType*>(workspace.dptr_ + nnr_bytes);
row_idx_ptr = reinterpret_cast<IType*>(workspace.dptr_ + nnr_bytes +
original_idx_bytes);
col_idx_copy_ptr = reinterpret_cast<IType*>(workspace.dptr_ + nnr_bytes +
original_idx_bytes + row_idx_bytes);
lookup_table_ptr = reinterpret_cast<IType*>(workspace.dptr_ + nnr_bytes +
original_idx_bytes + row_idx_bytes +
col_idx_copy_bytes);
temp_storage_ptr = workspace.dptr_ + nnr_bytes + original_idx_bytes +
row_idx_bytes + col_idx_copy_bytes + lookup_table_bytes;
// Fill original_idx
Kernel<range_fwd, gpu>::Launch(
s, nnz, 1, IType(0), IType(1), kWriteTo, original_idx_ptr);
// Make a copy of col_idx_l
Kernel<mxnet_op::op_with_req<mshadow_op::identity, kWriteTo>, gpu>::Launch(
s, nnz, col_idx_copy_ptr, col_idx_l_ptr);
// Construct the tensors needed for SortByKey
Tensor<gpu, 1, IType> col_idx_copy(col_idx_copy_ptr, Shape1(nnz), s);
Tensor<gpu, 1, IType> original_idx(original_idx_ptr, Shape1(nnz), s);
Tensor<gpu, 1, char> temp_storage(temp_storage_ptr, Shape1(total_temp_bytes), s);
int num_bits = common::ilog2ul(num_cols_l - 1);
SortByKey(col_idx_copy, original_idx, true, &temp_storage, 0, num_bits);
// over-allocate aux indices
ret->CheckAndAllocAuxData(rowsparse::kIdx, Shape1(nnz));
// compute unique indices
IType* ret_idx_ptr = ret->aux_data(rowsparse::kIdx).dptr<IType>();
cub::DeviceSelect::Unique(temp_storage_ptr, unique_temp_bytes, col_idx_copy_ptr, ret_idx_ptr,
nnr_ptr, nnz, stream);
// retrieve num non-zero rows
size_t nnr = 0;
CUDA_CALL(cudaMemcpy(&nnr, nnr_ptr, nnr_bytes, cudaMemcpyDeviceToHost));
// allocate data
ret->CheckAndAllocData(mshadow::Shape2(nnz, num_cols_r));
// generate lookup table
Kernel<MarkLookupTable, gpu>::Launch(s, nnr, lookup_table_ptr, ret_idx_ptr);
// Scatter csr indptr to row id
Kernel<CsrRowScatterKernel, gpu>::Launch(
s, num_rows_l, indptr_l.dptr<IType>(), row_idx_ptr, num_rows_l);
Kernel<DotCsrTransDnsRspKernel, gpu>::Launch(s, nnz * num_cols_r,
ret->data().dptr<DType>(),
lookup_table_ptr, col_idx_copy_ptr, nnz,
original_idx_ptr, data_r.dptr<DType>(),
data_l.dptr<DType>(),
row_idx_ptr, num_cols_r);
// reshape aux data
ret->set_aux_shape(rowsparse::kIdx, Shape1(nnr));
} else {
LOG(FATAL) << "DotCsrDnsRspImpl has not implemented dot(csr, dns) = rsp yet.";
}
});
});
});
}
/*!
* \brief GPU Impl of dot(csr, rsp1) = rsp2 and dot(csr.T, rsp1) = rsp2
* TODO: Optimize for GPU; this baseline implementation provides the operator
* functionality but is not yet tuned for GPU.
*/
inline void DotCsrRspRspImpl(const OpContext& ctx,
const gpu& gpu_dev,
const NDArray& lhs,
const NDArray& rhs,
const OpReqType req,
const bool trans_lhs,
NDArray* ret) {
if (kNullOp == req) return;
// Reuse dot(csr, dns) implementation if rhs rsp matrix is in fact dense
if (rhs.storage_shape()[0] == rhs.shape()[0]) {
DotCsrDnsRspImpl(ctx, gpu_dev, lhs, rhs.data(), req, trans_lhs, ret);
return;
}
CHECK_EQ(lhs.storage_type(), kCSRStorage);
CHECK_EQ(rhs.storage_type(), kRowSparseStorage);
CHECK_EQ(ret->storage_type(), kRowSparseStorage);
mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
if (!lhs.storage_initialized() || !rhs.storage_initialized()) {
FillZerosRspImpl(s, *ret);
return;
}
CHECK_EQ(req, kWriteTo);
using mshadow::Shape1;
using mxnet_op::Kernel;
using mxnet_op::set_zero;
using nnvm::dim_t;
const TBlob data_l = lhs.data();
const TBlob indptr_l = lhs.aux_data(csr::kIndPtr);
const TBlob col_idx_l = lhs.aux_data(csr::kIdx);
const TBlob data_r = rhs.data();
const TBlob row_idx_r = rhs.aux_data(rowsparse::kIdx);
const dim_t num_rows_l = lhs.shape()[0];
const dim_t num_cols_l = lhs.shape()[1];
const dim_t num_cols_r = rhs.shape()[1];
const dim_t nnr_r = rhs.storage_shape()[0];
const dim_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize;
dim_t num_threads;
// TODO: remove kernel dependency on warpSize=32
if (threads_per_warp != 32) {
LOG(FATAL) << "DotCsrRspRspImpl GPU kernels expect warpSize=32";
}
MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type
MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type
MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type
MSHADOW_IDX_TYPE_SWITCH(row_idx_r.type_flag_, RType, { // row idx type
if (trans_lhs) {
// Compute number of non-zero rows (nnr) of output matrix
// - alloc temp storage for row_flg array and for cub's prefix sum
// - mark non-zero columns of csr matrix in row_flg
// - compute inclusive prefix sum over marked array
// - copy last value (nnr_out) from device to host
dim_t* row_flg_out = NULL;
void* d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum(d_temp_storage,
temp_storage_bytes,
row_flg_out,
row_flg_out,
num_cols_l,
mshadow::Stream<gpu>::GetStream(s));
mshadow::Tensor<gpu, 1, char> workspace = ctx.requested[0]
.get_space_typed<gpu, 1, char>(Shape1(num_cols_l * sizeof(dim_t) +
temp_storage_bytes), s);
row_flg_out = reinterpret_cast<dim_t*>(workspace.dptr_);
d_temp_storage = workspace.dptr_ + num_cols_l*sizeof(dim_t);
num_threads = num_cols_l;
Kernel<set_zero, gpu>::Launch(s, num_threads, row_flg_out);
num_threads = num_rows_l * threads_per_warp;
Kernel<MarkCsrColWarpKernel, gpu>::Launch(s, num_threads,
row_flg_out, col_idx_l.dptr<CType>(), indptr_l.dptr<IType>(),
num_rows_l, num_cols_l);
cub::DeviceScan::InclusiveSum(d_temp_storage,
temp_storage_bytes,
row_flg_out,
row_flg_out,
num_cols_l,
mshadow::Stream<gpu>::GetStream(s));
dim_t nnr_out = 0;
CUDA_CALL(cudaMemcpy(&nnr_out, &row_flg_out[num_cols_l-1], sizeof(dim_t),
cudaMemcpyDeviceToHost));
if (0 == nnr_out) {
FillZerosRspImpl(s, *ret);
return;
}
// Allocate output matrix space
ret->CheckAndAlloc({mshadow::Shape1(nnr_out)});
const TBlob data_out_blob = ret->data();
const TBlob row_idx_out_blob = ret->aux_data(rowsparse::kIdx);
DType* data_out = data_out_blob.dptr<DType>();
RType* row_idx_out = row_idx_out_blob.dptr<RType>();
num_threads = nnr_out * num_cols_r;
Kernel<set_zero, gpu>::Launch(s, num_threads, data_out);
num_threads = nnr_out;
Kernel<set_zero, gpu>::Launch(s, num_threads, row_idx_out);
// Fill row_idx array of output matrix, using the row_flg values
num_threads = num_cols_l;
Kernel<FillRspRowIdxKernel, gpu>::Launch(s, num_threads,
row_idx_out, row_flg_out, num_cols_l);
// Perform matrix-matrix multiply
num_threads = nnr_out;
Kernel<DotCsrTransRspRspByRowsKernel, gpu>::Launch(s, num_threads,
data_out, row_idx_out,
data_l.dptr<DType>(), indptr_l.dptr<IType>(), col_idx_l.dptr<CType>(),
data_r.dptr<DType>(), row_idx_r.dptr<RType>(),
num_cols_r, nnr_r, nnr_out);
} else {
LOG(FATAL) << "DotCsrRspRspImpl has not implemented dot(csr, rsp1) = rsp2 yet.";
}
});
});
});
});
}
/*!
* \brief GPU Impl of dot(csr, rsp) = dns and dot(csr.T, rsp) = dns
*/
inline void DotCsrRspDnsImpl(const OpContext& ctx,
const gpu& gpu_dev,
const NDArray& lhs,
const NDArray& rhs,
const OpReqType req,
const bool trans_lhs,
TBlob* ret) {
// Reuse dot(csr, dns) implementation if rhs rsp matrix is in fact dense
if (rhs.storage_shape()[0] == rhs.shape()[0]) {
DotCsrDnsDnsImpl(ctx, gpu_dev, lhs, rhs.data(), req, trans_lhs, ret);
return;
}
if (kNullOp == req) return;
CHECK_EQ(lhs.storage_type(), kCSRStorage);
CHECK_EQ(rhs.storage_type(), kRowSparseStorage);
using mxnet_op::Kernel;
using mxnet_op::set_zero;
mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
if (!lhs.storage_initialized() || !rhs.storage_initialized()) {
if (kWriteTo == req) {
MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, { // data type
Kernel<set_zero, gpu>::Launch(s, ret->Size(), ret->dptr<DType>());
});
}
return;
}
using nnvm::dim_t;
const dim_t num_rows = ret->shape_[0];
const dim_t num_cols = ret->shape_[1];
const dim_t nnr_r = rhs.storage_shape()[0];
dim_t num_threads;
const TBlob data_l = lhs.data();
const TBlob indptr_l = lhs.aux_data(csr::kIndPtr);
const TBlob col_idx_l = lhs.aux_data(csr::kIdx);
const TBlob data_r = rhs.data();
const TBlob row_idx_r = rhs.aux_data(rowsparse::kIdx);
MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type
MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type
MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type
MSHADOW_IDX_TYPE_SWITCH(row_idx_r.type_flag_, RType, { // row idx type
if (kWriteTo == req) {
num_threads = num_rows*num_cols;
Kernel<set_zero, gpu>::Launch(s, num_threads, ret->dptr<DType>());
}
if (trans_lhs) {
LOG(FATAL) << "DotCsrRspDnsImpl has not implemented dot(csr.T, rsp) = dns yet.";
} else {
// TODO: Consider implementing a vector kernel for SpMV (similar to DotCsrDnsDns)
// Alloc temp storage for row_flg array
RType* row_flg_r = ctx.requested[0]
.get_space_typed<gpu, 1, RType>(mshadow::Shape1(rhs.shape()[0]), s).dptr_;
num_threads = rhs.shape()[0];
Kernel<set_zero, gpu>::Launch(s, num_threads, row_flg_r);
// Set row_flg index array
num_threads = nnr_r;
Kernel<IndexRspRowFlgKernel, gpu>::Launch(s, num_threads,
row_flg_r, row_idx_r.dptr<RType>(), nnr_r);
// Perform sparse matrix-matrix multiply
num_threads = num_rows*num_cols;
Kernel<DotCsrRspDnsScalarKernel, gpu>::Launch(s, num_threads,
ret->dptr<DType>(),
data_l.dptr<DType>(), indptr_l.dptr<IType>(), col_idx_l.dptr<CType>(),
data_r.dptr<DType>(), row_idx_r.dptr<RType>(), row_flg_r, rhs.storage_shape()[0],
num_rows, num_cols);
}
});
});
});
});
}
/*!
* \brief GPU Impl of dot(dns, csr) = csr
*/
inline void DotDnsCsrCsrImpl(const OpContext& ctx, const gpu& gpu_dev,
const TBlob& lhs, const NDArray& rhs,
const OpReqType req, NDArray* ret) {
LOG(FATAL) << "dot(dense, csr) = csr is not implemented on GPU";
}
/*!
* \brief GPU Impl of dot(dns, csr) = dns and dot(dns, csr.T) = dns
*/
inline void DotDnsCsrDnsImpl(const OpContext& ctx, const gpu& gpu_dev,
const TBlob& dns, const NDArray& rhs,
const OpReqType req, NDArray* ret,
const bool transpose_b) {
if (req == kNullOp) {
return;
}
CHECK_EQ(req, kWriteTo);
CHECK_EQ(rhs.storage_type(), kCSRStorage);
using namespace mshadow;
using namespace mshadow::expr;
using nnvm::dim_t;
/* Initialize data structures */
mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
TBlob csr_data = rhs.data();
TBlob csr_indices = rhs.aux_data(csr::kIdx);
TBlob csr_indptr = rhs.aux_data(csr::kIndPtr);
if (!rhs.storage_initialized()) {
Fill(s, ret->data(), req, 0);
return;
}
MSHADOW_SGL_DBL_TYPE_SWITCH(csr_data.type_flag_, DType, { // data type
MSHADOW_IDX_TYPE_SWITCH(csr_indices.type_flag_, IType, { // col idx type
MSHADOW_IDX_TYPE_SWITCH(csr_indptr.type_flag_, CType, { // indptr type
const nnvm::dim_t out_num_rows = ret->shape()[0];
const nnvm::dim_t out_num_cols = ret->shape()[1];
// if dot(dense, csr) = dns, transform to csc first
if (!transpose_b) {
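// The conversion below mirrors the trans_lhs path of DotCsrDnsDnsImpl: the rhs csr
// matrix is turned into csc format in the workspace (sort column indices, scatter row
// ids, histogram + exclusive scan into csc_indptr, gather data/indices), after which
// the csc arrays are fed to DotDnsCsrTransDnsKernel.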
const nnvm::dim_t num_csr_rows = rhs.shape()[0];
const nnvm::dim_t num_csr_cols = rhs.shape()[1];
const nnvm::dim_t num_dns_rows = dns.shape_[0];
const nnvm::dim_t nnz = rhs.storage_shape().Size();
IType* original_idx_ptr = nullptr;
IType* csc_indices_ptr = nullptr;
IType* csc_cols_ptr = nullptr;
CType* csr_rows_ptr = nullptr;
CType* csc_indptr_ptr = nullptr;
DType* csc_data_ptr = nullptr;
char* temp_storage_ptr = nullptr;
size_t original_idx_bytes = nnz*sizeof(IType);
size_t csc_indices_bytes = nnz*sizeof(IType);
size_t csc_cols_bytes = nnz*sizeof(IType);
size_t csr_rows_bytes = nnz*sizeof(CType);
size_t csc_indptr_bytes = (num_csr_cols+1)*sizeof(CType);
size_t csc_data_bytes = nnz*sizeof(DType);
size_t scan_temp_storage_bytes = 0;
size_t temp_storage_bytes = SortByKeyWorkspaceSize<IType, IType, gpu>(nnz);
IType* csr_indices_ptr = csr_indices.dptr<IType>();
cub::DeviceScan::ExclusiveSum(temp_storage_ptr,
scan_temp_storage_bytes,
csc_indptr_ptr,
csc_indptr_ptr,
num_csr_cols+1,
mshadow::Stream<gpu>::GetStream(s));
temp_storage_bytes = std::max(temp_storage_bytes, scan_temp_storage_bytes);
temp_storage_bytes += (sizeof(dim_t) - temp_storage_bytes % sizeof(dim_t));
size_t total_workspace_bytes =
original_idx_bytes + csc_indices_bytes + csc_cols_bytes + csr_rows_bytes +
csc_indptr_bytes + csc_data_bytes + temp_storage_bytes;
total_workspace_bytes += (sizeof(IType) - total_workspace_bytes % sizeof(IType));
Tensor<gpu, 1, char> workspace = ctx.requested[0]
.get_space_typed<gpu, 1, char>(Shape1(total_workspace_bytes), s);
original_idx_ptr = reinterpret_cast<IType*>(workspace.dptr_);
csc_indices_ptr = reinterpret_cast<IType*>(workspace.dptr_ + original_idx_bytes);
csc_cols_ptr = reinterpret_cast<IType*>(workspace.dptr_ + original_idx_bytes +
csc_indices_bytes);
csr_rows_ptr = reinterpret_cast<CType*>(workspace.dptr_ + original_idx_bytes +
csc_indices_bytes + csc_cols_bytes);
csc_indptr_ptr = reinterpret_cast<CType*>(workspace.dptr_ + original_idx_bytes +
csc_indices_bytes + csc_cols_bytes +
csr_rows_bytes);
temp_storage_ptr = workspace.dptr_ + original_idx_bytes + csc_indices_bytes +
csc_cols_bytes + csr_rows_bytes + csc_indptr_bytes;
csc_data_ptr = reinterpret_cast<DType*>(
workspace.dptr_ + total_workspace_bytes - csc_data_bytes);
// Fill original_idx
mxnet_op::Kernel<range_fwd, gpu>::Launch(
s, nnz, 1, IType(0), IType(1), kWriteTo, original_idx_ptr);
// Fill csc_cols with copy of csr_indices
mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::identity, kWriteTo>, gpu>::Launch(
s, nnz, csc_cols_ptr, csr_indices_ptr);
// Allocate the tensors needed for SortByKey
Tensor<gpu, 1, IType> original_idx(original_idx_ptr, Shape1(nnz), s);
Tensor<gpu, 1, IType> csc_cols(csc_cols_ptr, Shape1(nnz), s);
Tensor<gpu, 1, char> temp_storage(temp_storage_ptr, Shape1(temp_storage_bytes), s);
int num_bits = common::ilog2ul(num_csr_cols - 1);
SortByKey(csc_cols, original_idx, true, &temp_storage, 0, num_bits);
// Scatter csr indptr to row id
mxnet_op::Kernel<CsrRowScatterKernel, gpu>::Launch(
s, num_csr_rows, csr_indptr.dptr<CType>(), csr_rows_ptr, num_csr_rows);
// Reset indptr to zero
mxnet_op::Kernel<mxnet_op::set_zero, gpu>::Launch(s, num_csr_cols+1, csc_indptr_ptr);
// Histogram on the sorted cols
mxnet_op::Kernel<HistogramKernel, gpu>::Launch(
s, nnz, csc_indptr_ptr, csc_cols_ptr, nnz);
// Scan the bin counts for every column to get csc_indptr
cub::DeviceScan::ExclusiveSum(temp_storage_ptr,
temp_storage_bytes,
csc_indptr_ptr,
csc_indptr_ptr,
num_csr_cols+1,
mshadow::Stream<gpu>::GetStream(s));
// Assign data to csc matrix arrays
mxnet_op::Kernel<CscDataIndicesKernel, gpu>::Launch(
s, nnz, original_idx_ptr, csr_data.dptr<DType>(), csr_rows_ptr, csc_data_ptr,
csc_indices_ptr, nnz);
mxnet_op::Kernel<DotDnsCsrTransDnsKernel, gpu>::Launch(
s, out_num_rows * out_num_cols, dns.dptr<DType>(),
csc_data_ptr, csc_indices_ptr, csc_indptr_ptr,
ret->data().dptr<DType>(), dns.shape_[1],
out_num_rows, out_num_cols);
} else {
mxnet_op::Kernel<DotDnsCsrTransDnsKernel, gpu>::Launch(
s, out_num_rows * out_num_cols, dns.dptr<DType>(),
csr_data.dptr<DType>(), csr_indices.dptr<IType>(),
csr_indptr.dptr<CType>(), ret->data().dptr<DType>(),
dns.shape_[1], out_num_rows, out_num_cols);
}
});
});
});
}
} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_TENSOR_DOT_INL_CUH_