/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file indexing_op.cu
* \brief GPU implementation of indexing operator
* \author Siyi Li, Chi Zhang
*/
#include "./indexing_op.h"
#include "./util/tensor_util-inl.cuh"
#include "./util/tensor_util-inl.h"
namespace mxnet {
namespace op {
/*! \brief If there are out-of-bound indices, out will be assigned to 1.
*/
struct is_valid_check {
template <typename DType>
MSHADOW_XINLINE static void Map(int i,
char* out,
const DType* data,
const DType min,
const DType max) {
if (data[i] < min || data[i] > max)
*out = 1;
}
};
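/*
* \brief gradient accumulation kernel for the row_sparse backward pass of take/Embedding
*
* Thread tid handles one element of the head gradient: tid / row_length selects the index
* (and thus the destination row) and tid % row_length selects the column; for example, with
* row_length = 4, tid = 10 maps to data index 2, column 2. prefix_sum is the inclusive prefix
* sum of the row-occurrence flags, so prefix_sum[irow] - 1 is the compacted row position in
* the row_sparse output, which is updated with atomicAdd (hence a non-deterministic
* summation order).
*/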
struct AddTakeGradRspGPUKernel {
template <typename DType, typename IType>
__device__ __forceinline__ static void Map(int tid,
DType* out,
const nnvm::dim_t* prefix_sum,
const IType* data,
const DType* ograd,
const nnvm::dim_t row_length) {
using nnvm::dim_t;
const dim_t data_i = tid / row_length;
const dim_t grad_i = tid % row_length;
const dim_t irow = static_cast<dim_t>(data[data_i]);
const dim_t rsp_row = prefix_sum[irow] - 1;
const DType val = ograd[data_i * row_length + grad_i];
atomicAdd(static_cast<DType*>(&(out[rsp_row * row_length + grad_i])), val);
}
};
/*
* \brief kernel for the backward computation of take, executed in deterministic order
* \param thread_id the thread id
* \param out the output gradient data
* \param lookup_table the table used to look up the position of an id in the gradient array
* \param sorted_data the sorted data input
* \param data_size the number of indices in the data input
* \param original_idx the original indices of the sorted data input
* \param ograd head gradient
* \param row_length the length of each output row (the embedding dimension)
* \param num_threads_per_row the number of threads that cooperatively process one row
* \param SZ the number of features each thread is responsible for
*/
template <int SZ>
struct AddTakeGradRspDeterministicKernel {
template <typename DType>
__device__ __forceinline__ static void Map(int thread_id,
DType* out,
const nnvm::dim_t* lookup_table,
const nnvm::dim_t* sorted_data,
const nnvm::dim_t data_size,
const nnvm::dim_t* original_idx,
const DType* ograd,
const nnvm::dim_t row_length,
const nnvm::dim_t num_threads_per_row) {
using nnvm::dim_t;
int tid = thread_id / num_threads_per_row;
const int feature_start = thread_id % num_threads_per_row * SZ;
int num_features = SZ;
if (feature_start + num_features > row_length) {
num_features = row_length - feature_start;
}
if (tid == 0 || sorted_data[tid - 1] != sorted_data[tid]) {
DType acc[SZ];
#pragma unroll
for (int i = 0; i < SZ; i++) {
acc[i] = 0;
}
const dim_t data = sorted_data[tid];
const dim_t row_id = lookup_table[data];
const dim_t out_offset = row_id * row_length + feature_start;
do {
const dim_t idx = original_idx[tid];
const dim_t ograd_offset = idx * row_length + feature_start;
for (int i = 0; i < num_features; i++) {
acc[i] += ograd[ograd_offset + i];
}
tid++;
} while (tid < data_size && sorted_data[tid - 1] == sorted_data[tid]);
for (int i = 0; i < num_features; i++) {
out[out_offset + i] += acc[i];
}
}
}
};
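/*
* \brief take kernel along axis 0
*
* For output element i the source row is idx[i / M], either clipped to [0, K-1] (clip = true)
* or wrapped modulo K (clip = false), and out_data[i] = in_data[row * M + i % M]. For example,
* with K = 5, index 7 becomes 4 when clipping and 2 when wrapping; index -1 becomes 0 and 4.
*/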
template <bool clip = true>
struct TakeZeroAxisGPU {
// assume that idx has been flattened to a 1-D tensor (N,)
// assume that out_data and in_data have been flattened to 2-D tensors, (N, M) and (K, M)
// M is the number of columns of in_data and out_data
// K is the number of rows of in_data
// i is the index of out_data
template <typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i,
DType* out_data,
const DType* in_data,
const IType* idx,
const int64_t M,
const int64_t K) {
int64_t j = static_cast<int64_t>(idx[i / M]);
if (clip) {
if (j <= 0)
j = 0;
else if (j >= K)
j = K - 1;
} else {
j = j % K;
j += (j < 0) ? K : 0;
}
out_data[i] = in_data[j * M + i % M];
}
};
/*
* \brief returns true if all indices are within the range [min, max]
* \param s the stream
* \param data_ptr the indices on the stream
* \param data_size the number of indices to examine
* \param min the expected min value for indices
* \param max the expected max value for indices
* \param is_valid_ptr the temporary workspace
*/
template <typename DType>
bool CheckIndexOutOfBound(mshadow::Stream<gpu>* s,
const DType* data_ptr,
size_t data_size,
const DType min,
const DType max,
char* is_valid_ptr) {
using namespace mxnet_op;
int32_t is_valid = 0;
Kernel<set_zero, gpu>::Launch(s, 1, is_valid_ptr);
Kernel<is_valid_check, gpu>::Launch(s, data_size, is_valid_ptr, data_ptr, min, max);
CUDA_CALL(cudaMemcpyAsync(&is_valid,
is_valid_ptr,
sizeof(char),
cudaMemcpyDeviceToHost,
mshadow::Stream<gpu>::GetStream(s)));
CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
return is_valid == 0;
}
// Embedding forward implementation with dense weight
template <>
void EmbeddingOpForwardDnsImpl<gpu>(mshadow::Stream<gpu>* s,
const TBlob& data,
const TBlob& weight,
const OpReqType req,
const TBlob& output) {
using namespace mxnet_op;
const mxnet::TShape& ishape = data.shape_;
const mxnet::TShape& oshape = output.shape_;
MSHADOW_TYPE_SWITCH(output.type_flag_, DType, {
MSHADOW_TYPE_SWITCH(data.type_flag_, IType, {
Tensor<gpu, 1, IType> idx =
data.get_with_shape<gpu, 1, IType>(Shape1(ishape.ProdShape(0, ishape.ndim())), s);
Tensor<gpu, 2, DType> wmat = weight.get<gpu, 2, DType>(s);
Tensor<gpu, 2, DType> out = output.get_with_shape<gpu, 2, DType>(
Shape2(oshape.ProdShape(0, oshape.ndim() - 1), oshape[oshape.ndim() - 1]), s);
Kernel<TakeZeroAxisGPU<true>, gpu>::Launch(
s, oshape.Size(), out.dptr_, wmat.dptr_, idx.dptr_, wmat.shape_[1], wmat.shape_[0]);
});
});
}
template <>
void SparseEmbeddingOpForwardRspImpl<gpu>(const OpContext& ctx,
const TBlob& data,
const NDArray& weight,
const OpReqType req,
const TBlob& output) {
if (req == kNullOp)
return;
using namespace rowsparse;
using namespace mxnet_op;
mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
// if the weight is uninitialized, fill the output with zeros and return
if (req == kWriteTo && !weight.storage_initialized()) {
size_t out_size = output.shape_.Size();
MSHADOW_TYPE_SWITCH(output.type_flag_, DType, {
Fill<false>(
s, TBlob(output.dptr<DType>(), mshadow::Shape1(out_size), gpu::kDevMask), kWriteTo, 0);
})
return;
}
// check out-of-bound indices
MSHADOW_TYPE_SWITCH(data.type_flag_, DType, {
DType min = 0;
DType max = static_cast<DType>(weight.shape()[0] - 1);
DType* data_ptr = data.dptr<DType>();
size_t data_size = data.shape_.Size();
Tensor<gpu, 1, char> workspace = ctx.requested[0].get_space_typed<gpu, 1, char>(Shape1(1), s);
char* is_valid_ptr = reinterpret_cast<char*>(workspace.dptr_);
bool is_valid = CheckIndexOutOfBound(s, data_ptr, data_size, min, max, is_valid_ptr);
CHECK(is_valid) << "SparseEmbedding input contains data out of bound";
})
// the weight is actually dense
if (weight.aux_shape(kIdx)[0] == weight.shape()[0]) {
EmbeddingOpForwardDnsImpl<gpu>(s, data, weight.data(), req, output);
} else {
EmbeddingOpForwardRspImpl<gpu>(s, data, weight, req, output);
}
}
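/*
* \brief deterministic kernel launch for the SparseEmbedding backward pass with a row_sparse
* gradient: validate the indices, sort a copy of them together with their original positions,
* extract the unique row ids with cub::DeviceSelect::Unique to allocate exactly nnr output
* rows, build a lookup table from row id to compacted row, and accumulate the head gradient
* per unique index so that every output element is written by a single thread.
*/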
template <typename IType, typename DType, typename RType>
void SparseEmbeddingDeterministicKernelLaunch(const OpContext& ctx,
const TBlob& ograd,
const TBlob& data,
const OpReqType req,
const NDArray& output) {
using namespace mshadow;
using namespace mxnet_op;
using namespace expr;
using namespace rowsparse;
using nnvm::dim_t;
mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
const dim_t num_rows = output.shape()[0];
const dim_t row_length = output.shape()[1];
const dim_t data_size = static_cast<dim_t>(data.shape_.Size());
// temp resource declarations
dim_t* lookup_table = nullptr;
void* temp_storage = nullptr;
dim_t* sorted_data = nullptr;
dim_t* original_idx = nullptr;
// calculate number of bytes for temp resources
size_t lookup_table_bytes = num_rows * sizeof(dim_t);
size_t sorted_data_storage_bytes = data_size * sizeof(dim_t);
size_t original_idx_storage_bytes = data_size * sizeof(dim_t);
size_t sort_workspace_size = SortByKeyWorkspaceSize<dim_t, dim_t, gpu>(data_size);
size_t unique_workspace_bytes = 0;
// estimate unique temp space
IType* data_ptr = data.dptr<IType>();
size_t* null_ptr = nullptr;
// unique operations will be applied on sorted data
cub::DeviceSelect::Unique(nullptr,
unique_workspace_bytes,
sorted_data,
sorted_data,
null_ptr,
data_size,
Stream<gpu>::GetStream(s));
// One more space reserved for unique count
size_t temp_workspace_bytes = std::max(unique_workspace_bytes, sort_workspace_size);
size_t total_storage_bytes = lookup_table_bytes + sorted_data_storage_bytes +
original_idx_storage_bytes + temp_workspace_bytes;
// request resource and split it. layout is:
// lookup_table, sorted_data, original_idx, temp_storage
Tensor<gpu, 1, char> workspace =
ctx.requested[0].get_space_typed<gpu, 1, char>(Shape1(total_storage_bytes), s);
lookup_table = reinterpret_cast<dim_t*>(workspace.dptr_);
sorted_data = reinterpret_cast<dim_t*>(workspace.dptr_ + lookup_table_bytes);
original_idx =
reinterpret_cast<dim_t*>(workspace.dptr_ + lookup_table_bytes + sorted_data_storage_bytes);
temp_storage = workspace.dptr_ + total_storage_bytes - temp_workspace_bytes;
// check out-of-bound indices
{
IType min = 0;
IType max = static_cast<IType>(output.shape()[0] - 1);
IType* data_ptr = data.dptr<IType>();
size_t data_size = data.shape_.Size();
bool is_valid = CheckIndexOutOfBound(
s, data_ptr, data_size, min, max, reinterpret_cast<char*>(temp_storage));
CHECK(is_valid) << "Embedding input contains data out of bound";
}
// make a copy of the data, to be sorted
TBlob sorted_data_blob(sorted_data, Shape1(data_size), gpu::kDevMask);
auto sorted_data_tensor = sorted_data_blob.FlatTo1D<gpu, dim_t>(s);
mxnet_op::copy(s, sorted_data_blob, data);
// generate original idx
Tensor<gpu, 1, dim_t> original_idx_tensor(original_idx, Shape1(data_size), s);
Kernel<range_fwd, gpu>::Launch(
s, data_size, 1, static_cast<dim_t>(0), static_cast<dim_t>(1), kWriteTo, original_idx);
// sort data with its original idx
int num_bits = common::ilog2ui(num_rows - 1);
char* temp_storage_ptr = reinterpret_cast<char*>(temp_storage);
Tensor<gpu, 1, char> temp_storage_tensor(temp_storage_ptr, Shape1(sort_workspace_size), s);
SortByKey(sorted_data_tensor, original_idx_tensor, true, &temp_storage_tensor, 0, num_bits);
// compute unique row ids based on sorted values.
output.CheckAndAllocAuxData(kIdx, Shape1(data_size + 1));
// fill the row_idx array of the output with the unique (sorted) row ids
RType* grad_row_idx = output.aux_data(kIdx).dptr<RType>();
cub::DeviceSelect::Unique(temp_storage_ptr,
unique_workspace_bytes,
sorted_data,
grad_row_idx,
grad_row_idx + data_size,
data_size,
Stream<gpu>::GetStream(s));
dim_t nnr = 0;
CUDA_CALL(cudaMemcpyAsync(&nnr,
grad_row_idx + data_size,
sizeof(RType),
cudaMemcpyDeviceToHost,
mshadow::Stream<gpu>::GetStream(s)));
CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
CHECK_EQ(output.shape().ndim(), 2) << "Unexpected ndim";
output.CheckAndAllocData(Shape2(nnr, output.shape()[1]));
output.set_aux_shape(kIdx, Shape1(nnr));
// generate lookup table
Kernel<MarkLookupTable, gpu>::Launch(s, nnr, lookup_table, grad_row_idx);
// accumulate gradients
DType* grad_data = output.data().dptr<DType>();
Fill<false>(s, TBlob(grad_data, Shape1(nnr * row_length), gpu::kDevMask), kWriteTo, 0);
const int SZ = 4;
const nnvm::dim_t num_threads_per_row = (row_length + SZ - 1) / SZ;
Kernel<AddTakeGradRspDeterministicKernel<SZ>, gpu>::Launch(s,
data_size * num_threads_per_row,
grad_data,
lookup_table,
sorted_data,
data_size,
original_idx,
ograd.dptr<DType>(),
row_length,
num_threads_per_row);
}
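/*
* \brief entry point of the deterministic backward pass: dispatches on the index, gradient and
* auxiliary index types before calling SparseEmbeddingDeterministicKernelLaunch.
*/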
inline void SparseEmbeddingOpBackwardDeterministicRspImpl(const OpContext& ctx,
const TBlob& ograd,
const TBlob& data,
const OpReqType req,
const NDArray& output) {
using nnvm::dim_t;
if (req == kNullOp)
return;
CHECK_EQ(req, kWriteTo) << "SparseEmbedding layer doesn't support "
<< "weight gradient calculation with req != write";
mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
const dim_t data_size = static_cast<dim_t>(data.shape_.Size());
if (data_size == 0) {
FillZerosRspImpl(s, output);
return;
}
MSHADOW_TYPE_SWITCH(data.type_flag_, IType, {
MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, {
MSHADOW_IDX_TYPE_SWITCH(output.aux_type(rowsparse::kIdx), RType, {
SparseEmbeddingDeterministicKernelLaunch<IType, DType, RType>(
ctx, ograd, data, req, output);
});
});
});
}
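/*
* \brief backward pass for SparseEmbedding with a row_sparse weight gradient. The deterministic
* variant above is used when requested; otherwise non-zero rows are marked, an inclusive prefix
* sum compacts them into nnr output rows, and AddTakeGradRspGPUKernel accumulates the head
* gradient with atomicAdd.
*/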
template <>
inline void SparseEmbeddingOpBackwardRspImpl<gpu>(const bool deterministic,
const OpContext& ctx,
const TBlob& ograd,
const TBlob& data,
const OpReqType req,
const NDArray& output) {
if (deterministic) {
SparseEmbeddingOpBackwardDeterministicRspImpl(ctx, ograd, data, req, output);
return;
}
using namespace mshadow;
using namespace mxnet_op;
using namespace mshadow::expr;
using namespace rowsparse;
using nnvm::dim_t;
if (req == kNullOp)
return;
CHECK_EQ(req, kWriteTo) << "SparseEmbedding layer doesn't support "
<< "weight gradient calculation with req != write";
// Request temporary storage for marking non-zero rows and prefix sum
Stream<gpu>* s = ctx.get_stream<gpu>();
dim_t num_rows = output.shape()[0];
dim_t row_length = output.shape()[1];
dim_t data_size = static_cast<dim_t>(data.shape_.Size());
dim_t num_threads;
MSHADOW_TYPE_SWITCH(data.type_flag_, IType, {
MSHADOW_SGL_DBL_TYPE_SWITCH(ograd.type_flag_, DType, {
MSHADOW_IDX_TYPE_SWITCH(output.aux_type(kIdx), RType, {
dim_t* prefix_sum = nullptr;
void* d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum(d_temp_storage,
temp_storage_bytes,
prefix_sum,
prefix_sum,
num_rows,
Stream<gpu>::GetStream(s));
Tensor<gpu, 1, char> workspace = ctx.requested[0].get_space_typed<gpu, 1, char>(
Shape1(num_rows * sizeof(dim_t) + temp_storage_bytes), s);
prefix_sum = reinterpret_cast<dim_t*>(workspace.dptr_);
d_temp_storage = workspace.dptr_ + num_rows * sizeof(dim_t);
num_threads = num_rows;
Fill<false>(s, TBlob(prefix_sum, Shape1(num_threads), gpu::kDevMask), kWriteTo, 0);
Kernel<MarkRowFlgKernel, gpu>::Launch(s, data_size, prefix_sum, data.dptr<IType>());
cub::DeviceScan::InclusiveSum(d_temp_storage,
temp_storage_bytes,
prefix_sum,
prefix_sum,
num_rows,
mshadow::Stream<gpu>::GetStream(s));
dim_t nnr = 0;
CUDA_CALL(cudaMemcpyAsync(&nnr,
&prefix_sum[num_rows - 1],
sizeof(dim_t),
cudaMemcpyDeviceToHost,
mshadow::Stream<gpu>::GetStream(s)));
CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
if (nnr == 0) {
FillZerosRspImpl(s, output);
return;
}
output.CheckAndAlloc({Shape1(nnr)});
RType* grad_row_idx = output.aux_data(kIdx).dptr<RType>();
// fill row_idx array of output matrix, using the row_flg values
Kernel<FillRspRowIdxKernel, gpu>::Launch(s, num_rows, grad_row_idx, prefix_sum, num_rows);
// prefill with zeros
DType* grad_data = output.data().dptr<DType>();
Fill<false>(s, TBlob(grad_data, Shape1(nnr * row_length), gpu::kDevMask), kWriteTo, 0);
// add the final gradients
num_threads = row_length * data_size;
Kernel<AddTakeGradRspGPUKernel, gpu>::Launch(s,
num_threads,
grad_data,
prefix_sum,
data.dptr<IType>(),
ograd.dptr<DType>(),
row_length);
});
});
});
}
/*
* \brief check if any of the indices is out of bound
* \param s the stream
* \param idx_ptr the indices on the stream
* \param N the number of indices in an axis
* \param M the number of axes to examine
* \param mshape the array that stores the shape of each dimension
* \param is_valid_dim_ptr the temporary workspace that contains out-of-bound indices
*/
template <typename DType>
void GatherNDCheckBoundGPU(mshadow::Stream<gpu>* s,
const DType* idx_ptr,
index_t N,
index_t M,
const mshadow::Shape<10> mshape,
DType* is_valid_dim_ptr) {
using namespace mxnet_op;
Kernel<set_zero, gpu>::Launch(s, M, is_valid_dim_ptr);
Kernel<is_valid_check_gather_nd, gpu>::Launch(s, M, is_valid_dim_ptr, idx_ptr, N, mshape);
std::vector<DType> is_valid_dim(M);
CUDA_CALL(cudaMemcpyAsync(is_valid_dim.data(),
is_valid_dim_ptr,
sizeof(DType) * M,
cudaMemcpyDeviceToHost,
mshadow::Stream<gpu>::GetStream(s)));
CUDA_CALL(cudaStreamSynchronize(mshadow::Stream<gpu>::GetStream(s)));
for (int m = 0; m < M; m++) {
if (is_valid_dim[m] > mshape[m] - 1 || is_valid_dim[m] < -mshape[m]) {
LOG(FATAL) << "IndexError: index " << is_valid_dim[m] << " is out of bounds for axis " << m
<< " with size " << mshape[m];
}
}
}
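/*
* \brief GPU forward implementation of gather_nd: the leading M dimensions of the data are
* addressed by N index tuples taken from inputs[1], and for each tuple a contiguous row of
* K elements is copied to the output once the indices pass the bound check above.
*/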
void GatherNDForwardGPU(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mxnet_op;
using namespace mshadow;
CHECK_EQ(inputs.size(), 2U);
CHECK_EQ(outputs.size(), 1U);
if (req[0] == kNullOp)
return;
mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
const mxnet::TShape& dshape = inputs[0].shape_;
const mxnet::TShape& ishape = inputs[1].shape_;
int M = ishape[0];
int N = ishape.Size() / M;
int K = dshape.ProdShape(M, dshape.ndim());
mshadow::Shape<10> strides;
mshadow::Shape<10> mshape;
for (int i = M - 1, stride = K; i >= 0; stride *= dshape[i], --i) {
strides[i] = stride;
mshape[i] = dshape[i];
}
MSHADOW_TYPE_SWITCH_WITH_BOOL(inputs[0].type_flag_, DType, { // output data type switch
MSHADOW_TYPE_SWITCH(inputs[1].type_flag_, IType, { // indices data type switch
// check whether indices are out of bound
IType* idx_ptr = inputs[1].dptr<IType>();
Tensor<gpu, 1, IType> workspace =
ctx.requested[0].get_space_typed<gpu, 1, IType>(Shape1(M), s);
IType* is_valid_dim_ptr = reinterpret_cast<IType*>(workspace.dptr_);
GatherNDCheckBoundGPU(s, idx_ptr, N, M, mshape, is_valid_dim_ptr);
Kernel<gather_nd, gpu>::Launch(s,
N,
req[0],
N,
M,
K,
strides,
mshape,
outputs[0].dptr<DType>(),
inputs[0].dptr<DType>(),
inputs[1].dptr<IType>());
});
});
}
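/*
* \brief backward kernel for gather_nd: thread i recomputes the flat offset of its index tuple
* from the strides and scatter-adds its K gradient values with atomicAdd, since several index
* tuples may refer to the same output location.
*/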
struct backward_gather_nd_gpu {
template <typename DType, typename IType>
MSHADOW_XINLINE static void Map(index_t i,
index_t N,
index_t M,
index_t K,
const mshadow::Shape<10> strides,
DType* out,
const DType* data,
const IType* indices) {
index_t offset = 0;
for (index_t j = 0; j < M; ++j) {
offset += strides[j] * static_cast<int>(indices[j * N + i]);
}
for (index_t j = 0; j < K; ++j) {
atomicAdd(out + (offset + j), data[i * K + j]);
}
}
};
template <typename DType, typename IType>
inline void GatherNDBackwardImpl(index_t N,
index_t M,
index_t K,
const mshadow::Shape<10> strides,
DType* out,
const DType* data,
const IType* indices,
mshadow::Stream<gpu>* s) {
mxnet_op::Kernel<backward_gather_nd_gpu, gpu>::Launch(s, N, N, M, K, strides, out, data, indices);
}
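/*
* \brief GPU forward implementation of take. In raise mode the indices are validated first;
* axis 0 is handled by the flat TakeZeroAxisGPU kernel, while other axes use TakeNonzeroAxis
* with precomputed input and output strides.
*/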
template <>
void TakeOpForward<gpu>(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mxnet_op;
if (req[take_::kOut] == kNullOp)
return;
const TakeParam& param = nnvm::get<TakeParam>(attrs.parsed);
CHECK_EQ(inputs.size(), 2U);
CHECK_EQ(outputs.size(), 1U);
const mxnet::TShape& idxshape = inputs[take_::kIdx].shape_;
const mxnet::TShape& arrshape = inputs[take_::kArr].shape_;
const mxnet::TShape& oshape = outputs[take_::kOut].shape_;
if (idxshape.Size() == 0) {
return;
}
Stream<gpu>* s = ctx.get_stream<gpu>();
const int actual_axis = param.axis + ((param.axis < 0) ? arrshape.ndim() : 0);
MSHADOW_TYPE_SWITCH_WITH_BOOL(outputs[take_::kOut].type_flag_, DType, { // output data type
MSHADOW_TYPE_SWITCH_WITH_BOOL(inputs[take_::kIdx].type_flag_, IType, { // index data type
if (param.mode == take_::kRaise) {
// check out-of-bound indices
IType min = 0;
IType max = static_cast<IType>(arrshape[actual_axis] - 1);
IType* idx_ptr = inputs[take_::kIdx].dptr<IType>();
size_t idx_size = idxshape.Size();
Tensor<gpu, 1, char> workspace =
ctx.requested[0].get_space_typed<gpu, 1, char>(Shape1(1), s);
char* is_valid_ptr = reinterpret_cast<char*>(workspace.dptr_);
bool is_valid = CheckIndexOutOfBound(s, idx_ptr, idx_size, min, max, is_valid_ptr);
CHECK(is_valid) << "Take indices contains indices out of bound";
}
if (actual_axis == 0) {
if (param.mode == take_::kClip) {
Kernel<TakeZeroAxisGPU<true>, gpu>::Launch(s,
oshape.Size(),
outputs[take_::kOut].dptr<DType>(),
inputs[take_::kArr].dptr<DType>(),
inputs[take_::kIdx].dptr<IType>(),
oshape.Size() / idxshape.Size(),
arrshape[0]);
} else {
Kernel<TakeZeroAxisGPU<false>, gpu>::Launch(s,
oshape.Size(),
outputs[take_::kOut].dptr<DType>(),
inputs[take_::kArr].dptr<DType>(),
inputs[take_::kIdx].dptr<IType>(),
oshape.Size() / idxshape.Size(),
arrshape[0]);
}
} else {
mshadow::Shape<10> in_strides;
int stride = 1;
for (int i = arrshape.ndim() - 1; i >= 0; stride *= arrshape[i], --i) {
in_strides[i] = stride;
}
mshadow::Shape<10> out_strides;
stride = 1;
for (int i = oshape.ndim() - 1; i >= 0; stride *= oshape[i], --i) {
out_strides[i] = stride;
}
if (param.mode == take_::kClip) {
Kernel<TakeNonzeroAxis<true>, gpu>::Launch(s,
oshape.Size(),
outputs[take_::kOut].dptr<DType>(),
inputs[take_::kArr].dptr<DType>(),
inputs[take_::kIdx].dptr<IType>(),
out_strides[actual_axis - 1],
in_strides[actual_axis - 1],
in_strides[actual_axis],
arrshape.ndim(),
oshape.ndim(),
idxshape.ndim(),
arrshape[actual_axis],
actual_axis);
} else {
Kernel<TakeNonzeroAxis<false>, gpu>::Launch(s,
oshape.Size(),
outputs[take_::kOut].dptr<DType>(),
inputs[take_::kArr].dptr<DType>(),
inputs[take_::kIdx].dptr<IType>(),
out_strides[actual_axis - 1],
in_strides[actual_axis - 1],
in_strides[actual_axis],
arrshape.ndim(),
oshape.ndim(),
idxshape.ndim(),
arrshape[actual_axis],
actual_axis);
}
}
});
});
}
namespace {
/*
* \brief returns the number of bits needed to represent a, i.e. floor(log2(a)) + 1
*/
inline int ilog2(unsigned int a) {
int k = 1;
while (a >>= 1)
k++;
return k;
}
} // namespace
/*
* \brief finds the lower and upper-bound positions of each unique element within
* a sorted input array
*
* \param sorted_data input elements previously sorted
* \param bounds output containing all lower-bound followed by all upper-bound positions
* \param data_dim total number of elements in the input array
* \param vocab_dim maximum number of unique elements
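*
* For illustration: with sorted_data = {0, 0, 2, 2, 2, 3}, data_dim = 6 and vocab_dim = 4,
* bounds becomes {0, -1, 2, 5, 1, -2, 4, 5}; id 1 never occurs, while ids 0, 2 and 3 occupy
* positions [0, 1], [2, 4] and [5, 5] of sorted_data respectively.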
*/
template <typename IType>
__global__ void EmbeddingFindBounds(const IType* sorted_data,
IType* bounds,
const index_t data_dim,
const index_t vocab_dim) {
const index_t id = blockIdx.x * blockDim.x + threadIdx.x;
if (id >= vocab_dim)
return;
// Binary search to find lower bound: stored at bounds[0..vocab_dim-1]
IType lower_bound = 0;
IType upper_bound = data_dim - 1;
IType mean;
while (lower_bound < upper_bound) {
mean = (lower_bound + upper_bound) / 2;
if (id <= sorted_data[mean])
upper_bound = mean;
else
lower_bound = mean + 1;
}
bool found_row = (sorted_data[lower_bound] == id);
if (!found_row) {
bounds[id] = -1;
bounds[vocab_dim + id] = -2;
return;
} else {
bounds[id] = lower_bound;
}
// Binary search to find upper bound: stored at bounds[vocab_dim..2*vocab_dim-1]
lower_bound = 0;
upper_bound = data_dim - 1;
while (lower_bound < upper_bound) {
mean = (lower_bound + upper_bound + 1) / 2;
if (id >= sorted_data[mean])
lower_bound = mean;
else
upper_bound = mean - 1;
}
bounds[vocab_dim + id] = upper_bound;
}
/*
* \brief kernel to compute the gradient of EmbeddingOp
* \param grad_in gradient written to the embedding weight (gradient w.r.t. the op's input)
* \param original_index position in the original input data of each sorted index
* \param index_bounds lower and upper-bound positions of each unique index
* \param grad_out incoming head gradient (gradient w.r.t. the op's output)
* \param embbedding_dim dimension of the dense embedding
* \param vocab_dim maximum number of unique indices in the data array: tokens vocabulary size
* \param nelems_per_load number of DType elements per LType load (sizeof(LType) / sizeof(DType))
* \param req write/add/null
*/
template <typename AType, typename LType, typename DType, typename IType>
__global__ void EmbeddingGradKernel(DType* grad_in,
const IType* original_index,
const IType* index_bounds,
const DType* grad_out,
const index_t embbedding_dim,
const index_t vocab_dim,
const int nelems_per_load,
const int req) {
extern __shared__ int sharedmem[];
AType* grad_in_row = reinterpret_cast<AType*>(sharedmem);
const LType* aligned_grad_out = reinterpret_cast<const LType*>(grad_out);
LType* aligned_grad_in = reinterpret_cast<LType*>(grad_in);
const index_t aligned_emb_dim = embbedding_dim / nelems_per_load;
LType load_value[1];
DType* data_values = reinterpret_cast<DType*>(load_value);
IType my_row = blockIdx.x;
if (my_row < vocab_dim) {
// Read lower and upper bounds for current row
IType lower_bound = index_bounds[my_row];
IType upper_bound = index_bounds[vocab_dim + my_row];
int nOccurrences = upper_bound - lower_bound + 1;
for (index_t emb_id = threadIdx.x; emb_id < aligned_emb_dim; emb_id += blockDim.x) {
// Initialize grad_in
if (req == kAddTo) {
*load_value = aligned_grad_in[my_row * aligned_emb_dim + emb_id];
for (index_t val_id = 0; val_id < nelems_per_load; val_id++) {
grad_in_row[val_id * blockDim.x + threadIdx.x] = static_cast<AType>(data_values[val_id]);
}
} else {
for (index_t val_id = 0; val_id < nelems_per_load; val_id++) {
grad_in_row[val_id * blockDim.x + threadIdx.x] = static_cast<AType>(0.0);
}
}
// Add all rows from grad_out according to indices in data
for (index_t data_idx = lower_bound; data_idx < (lower_bound + nOccurrences); ++data_idx) {
*load_value = aligned_grad_out[original_index[data_idx] * aligned_emb_dim + emb_id];
for (index_t val_id = 0; val_id < nelems_per_load; val_id++) {
grad_in_row[val_id * blockDim.x + threadIdx.x] += static_cast<AType>(data_values[val_id]);
}
}
// Save results
for (index_t val_id = 0; val_id < nelems_per_load; val_id++) {
data_values[val_id] = static_cast<DType>(grad_in_row[val_id * blockDim.x + threadIdx.x]);
}
aligned_grad_in[my_row * aligned_emb_dim + emb_id] = *load_value;
}
}
}
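/*
* \brief host-side launcher for the dense Embedding backward kernel: clips and sorts the
* indices together with their original positions, finds the lower/upper bound of each
* vocabulary id with EmbeddingFindBounds, then launches EmbeddingGradKernel with one block
* per vocabulary row, vectorized LType loads and AType accumulation in shared memory.
*/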
template <typename AType, typename IType, typename DType>
void EmbeddingGradKernelCaller(const OpContext& ctx,
mshadow::Tensor<gpu, 2, DType> grad_in,
const mshadow::Tensor<gpu, 1, IType>& index,
const mshadow::Tensor<gpu, 2, DType>& grad_out,
const std::vector<OpReqType>& req) {
using namespace mxnet_op;
using namespace mshadow::expr;
Stream<gpu>* s = ctx.get_stream<gpu>();
const index_t data_dim = index.shape_[0];
const index_t vocab_dim = grad_in.shape_[0];
const index_t embbedding_dim = grad_in.shape_[1];
// Calculate amount of temporary storage
size_t sort_workspace_size = mxnet::op::SortByKeyWorkspaceSize<int, int, gpu>(data_dim);
size_t workspace_size =
2 * data_dim * sizeof(int) + 2 * vocab_dim * sizeof(int) + sort_workspace_size;
// Request temporary storage
Tensor<gpu, 1, char> workspace =
ctx.requested[embedding::kTempSpace].get_space_typed<gpu, 1, char>(Shape1(workspace_size), s);
// Create tensors
size_t pos = 0;
Tensor<gpu, 1, int> sorted_data(reinterpret_cast<int*>(&workspace[pos]), Shape1(data_dim), s);
pos += data_dim * sizeof(int);
// Reference to input data positions for each element of sorted_data
Tensor<gpu, 1, int> original_index(reinterpret_cast<int*>(&workspace[pos]), Shape1(data_dim), s);
pos += data_dim * sizeof(int);
// lower and upper bound positions of each index within sorted_data
Tensor<gpu, 1, int> bounds_index(
reinterpret_cast<int*>(&workspace[pos]), Shape1(2 * vocab_dim), s);
pos += 2 * vocab_dim * sizeof(int);
Tensor<gpu, 1, char> Sort_temp_storage(&workspace[pos], Shape1(sort_workspace_size), s);
// Clip indices to [0, vocab_dim-1]
Kernel<tcast_clip, gpu>::Launch(
s, data_dim, sorted_data.dptr_, index.dptr_, static_cast<int>(vocab_dim));
Kernel<range_fwd, gpu>::Launch(s, data_dim, 1, 0, 1, kWriteTo, original_index.dptr_);
// Sort indices array
int num_bits = ilog2((vocab_dim - 1));
mxnet::op::SortByKey(sorted_data, original_index, true, &Sort_temp_storage, 0, num_bits);
// Find lower & upper bounds of each possible index
const int threads_block_bounds = 128;
const int nblocks_bounds = (vocab_dim + threads_block_bounds - 1) / threads_block_bounds;
EmbeddingFindBounds<<<nblocks_bounds, threads_block_bounds, 0, Stream<gpu>::GetStream(s)>>>(
sorted_data.dptr_, bounds_index.dptr_, data_dim, vocab_dim);
// Compute Gradient
int ltype = mxnet::common::cuda::get_load_type(embbedding_dim * sizeof(DType));
MXNET_LOAD_TYPE_SWITCH(ltype, LType, {
CHECK_LE(sizeof(DType), sizeof(LType));
int nelems_per_load = sizeof(LType) / sizeof(DType);
int threads_block_grad = 32;
int maxThreads = 1024;
while (threads_block_grad < (embbedding_dim / nelems_per_load) &&
(threads_block_grad < maxThreads))
threads_block_grad += 32;
size_t required_shared = threads_block_grad * nelems_per_load * sizeof(AType);
dim3 blocks(vocab_dim, 1);
EmbeddingGradKernel<AType, LType>
<<<blocks, threads_block_grad, required_shared, Stream<gpu>::GetStream(s)>>>(
grad_in.dptr_,
original_index.dptr_,
bounds_index.dptr_,
grad_out.dptr_,
embbedding_dim,
vocab_dim,
nelems_per_load,
req[embedding::kWeight]);
});
}
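// GPU backward pass of dense Embedding: accumulates in AType (typically float32 for float16
// data) when MXNET_SAFE_ACCUMULATION is enabled, otherwise accumulates directly in DType.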
template <>
void EmbeddingOpBackward<gpu>(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
using namespace mshadow::expr;
CHECK_EQ(inputs.size(), 2U);
CHECK_EQ(outputs.size(), 2U);
CHECK_EQ(req[embedding::kData], kNullOp)
<< "Embedding layer doesn't support calculate data gradient";
if (req[embedding::kWeight] == kNullOp) {
return;
}
CHECK_EQ(outputs[1].type_flag_, inputs[0].type_flag_);
const mxnet::TShape& ishape = inputs[1].shape_;
const mxnet::TShape& oshape = inputs[0].shape_;
Stream<gpu>* s = ctx.get_stream<gpu>();
CHECK_NE(req[embedding::kWeight], kWriteInplace)
<< "Backward of Embedding does not support writing in place.";
bool safe_acc = dmlc::GetEnv("MXNET_SAFE_ACCUMULATION", true);
if (!safe_acc && outputs[1].type_flag_ == mshadow::kFloat16) {
common::LogOnce(
"MXNET_SAFE_ACCUMULATION=1 is recommended for EmbeddingOpBackward "
"with float16 inputs. "
"See https://mxnet.apache.org/api/faq/env_var "
"for more details.");
}
MXNET_REAL_ACC_TYPE_SWITCH(outputs[1].type_flag_, DType, AType, {
MSHADOW_TYPE_SWITCH(inputs[1].type_flag_, IType, {
Tensor<gpu, 1, IType> data =
inputs[1].get_with_shape<gpu, 1, IType>(Shape1(ishape.ProdShape(0, ishape.ndim())), s);
Tensor<gpu, 2, DType> grad_out = inputs[0].get_with_shape<gpu, 2, DType>(
Shape2(oshape.ProdShape(0, oshape.ndim() - 1), oshape[oshape.ndim() - 1]), s);
Tensor<gpu, 2, DType> grad_in = outputs[1].get<gpu, 2, DType>(s);
if (req[embedding::kWeight] == kWriteTo || req[embedding::kWeight] == kAddTo) {
if (safe_acc)
EmbeddingGradKernelCaller<AType>(ctx, grad_in, data, grad_out, req);
else
EmbeddingGradKernelCaller<DType>(ctx, grad_in, data, grad_out, req);
} else {
LOG(FATAL) << "wrong req";
}
});
});
}
NNVM_REGISTER_OP(Embedding).set_attr<FCompute>("FCompute<gpu>", EmbeddingOpForward<gpu>);
NNVM_REGISTER_OP(_backward_Embedding)
.set_attr<FCompute>("FCompute<gpu>", EmbeddingOpBackward<gpu>)
.set_attr<FComputeEx>("FComputeEx<gpu>", EmbeddingOpBackwardEx<gpu>);
NNVM_REGISTER_OP(take).set_attr<FCompute>("FCompute<gpu>", TakeOpForward<gpu>);
NNVM_REGISTER_OP(_backward_take).set_attr<FCompute>("FCompute<gpu>", TakeOpBackward<gpu>);
NNVM_REGISTER_OP(batch_take).set_attr<FCompute>("FCompute<gpu>", BatchTakeOpForward<gpu>);
NNVM_REGISTER_OP(one_hot).set_attr<FCompute>("FCompute<gpu>", OneHotOpForward<gpu>);
NNVM_REGISTER_OP(gather_nd).set_attr<FCompute>("FCompute<gpu>", GatherNDForwardGPU);
NNVM_REGISTER_OP(scatter_nd).set_attr<FCompute>("FCompute<gpu>", ScatterNDForward<gpu>);
NNVM_REGISTER_OP(_backward_gather_nd).set_attr<FCompute>("FCompute<gpu>", GatherNDBackward<gpu>);
NNVM_REGISTER_OP(_scatter_set_nd).set_attr<FCompute>("FCompute<gpu>", ScatterSetNDForward<gpu>);
} // namespace op
} // namespace mxnet