/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file optimizer_op.cu
* \brief Optimizer operators
* \author Junyuan Xie
*/
#include "./optimizer_op-inl.h"
#include <cub/cub.cuh>
namespace mxnet {
namespace op {
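// GPU kernel for the standard (non-lazy) SGD momentum update with a
// row_sparse gradient and dense weight/momentum arrays. Every weight element
// is updated; rows missing from the gradient contribute a zero gradient but
// still receive momentum decay and weight decay.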
template <int req>
struct SGDMomStdDnsRspDnsKernel<req, gpu> {
template <typename DType, typename IType, typename RType>
MSHADOW_XINLINE static void Map(int i,
index_t row_length,
DType* out_data,
DType* mom_data,
const DType* weight_data,
const IType* grad_idx,
const DType* grad_data,
const RType* prefix_sum,
const DType clip_gradient,
const DType momentum,
const DType lr,
const DType wd,
const DType rescale_grad) {
using nnvm::dim_t;
const dim_t row_id = i / row_length;
const dim_t col_id = i % row_length;
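// prefix_sum is an inclusive prefix sum over per-row presence flags, so a row
// occurs in the sparse gradient iff its entry exceeds the previous row's, and
// (prefix_sum[row_id] - 1) indexes that row within the compacted grad_data.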
const dim_t nnr = prefix_sum[row_id];
const bool non_zero = (row_id == 0) ? prefix_sum[0] > 0 : nnr > prefix_sum[row_id - 1];
const RType grad_i = (nnr - 1) * row_length + col_id;
const DType grad = non_zero ? grad_data[grad_i] : static_cast<DType>(0);
DType grad_rescaled = rescale_grad * grad;
if (clip_gradient >= 0.0f) {
grad_rescaled = mshadow_op::clip::Map(grad_rescaled, clip_gradient);
}
grad_rescaled += wd * weight_data[i];
mom_data[i] *= momentum;
mom_data[i] -= lr * grad_rescaled;
KERNEL_ASSIGN(out_data[i], req, weight_data[i] + mom_data[i]);
}
};
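// Standard SGD momentum update on GPU for a row_sparse gradient and dense
// weight: builds a per-row prefix sum with cub to map dense rows onto rows of
// the sparse gradient, then launches the kernel above over all weight elements.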
template <>
void SGDMomStdUpdateDnsRspDnsImpl<gpu>(const SGDMomParam& param,
const OpContext& ctx,
const TBlob& weight,
const NDArray& grad,
const TBlob& mom,
const OpReqType& req,
TBlob* out) {
using namespace mxnet_op;
using namespace rowsparse;
using namespace mshadow;
Stream<gpu>* s = ctx.get_stream<gpu>();
if (req == kNullOp)
return;
CHECK_EQ(req, kWriteInplace) << "kWriteInplace is expected for sparse sgd_mom_update";
CHECK_GT(weight.shape_.Size(), 0);
CHECK_GT(mom.shape_.Size(), 0);
MSHADOW_REAL_TYPE_SWITCH(weight.type_flag_, DType, {
MSHADOW_IDX_TYPE_SWITCH(grad.aux_type(kIdx), IType, {
MXNET_ASSIGN_REQ_SWITCH(req, req_type, {
DType* weight_data = weight.dptr<DType>();
IType* grad_idx = grad.aux_data(kIdx).dptr<IType>();
DType* grad_val = grad.data().dptr<DType>();
DType* mom_data = mom.dptr<DType>();
DType* out_data = out->dptr<DType>();
nnvm::dim_t num_rows = weight.shape_[0];
nnvm::dim_t row_length = weight.shape_.ProdShape(1, weight.ndim());
nnvm::dim_t* prefix_sum = nullptr;
void* d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
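// With a null d_temp_storage, this call only queries the required
// temp_storage_bytes for the scan; the actual scan runs below.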
cub::DeviceScan::InclusiveSum(d_temp_storage,
temp_storage_bytes,
prefix_sum,
prefix_sum,
num_rows,
Stream<gpu>::GetStream(s));
Tensor<gpu, 1, char> workspace = ctx.requested[0].get_space_typed<gpu, 1, char>(
Shape1(num_rows * sizeof(nnvm::dim_t) + temp_storage_bytes), s);
prefix_sum = reinterpret_cast<nnvm::dim_t*>(workspace.dptr_);
d_temp_storage = workspace.dptr_ + num_rows * sizeof(nnvm::dim_t);
// zero the per-row flags; rows present in grad are marked below
Fill<false>(s, TBlob(prefix_sum, Shape1(num_rows), gpu::kDevMask), kWriteTo, 0);
if (grad.storage_initialized()) {
Kernel<MarkRowFlgKernel, gpu>::Launch(s, grad.aux_shape(kIdx)[0], prefix_sum, grad_idx);
// calculate inclusive prefix sum
cub::DeviceScan::InclusiveSum(d_temp_storage,
temp_storage_bytes,
prefix_sum,
prefix_sum,
num_rows,
mshadow::Stream<gpu>::GetStream(s));
}
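// one thread per element of the dense weight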
size_t num_threads = num_rows * row_length;
Kernel<SGDMomStdDnsRspDnsKernel<req_type, gpu>, gpu>::Launch(
s,
num_threads,
row_length,
out_data,
mom_data,
weight_data,
grad_idx,
grad_val,
prefix_sum,
static_cast<DType>(param.clip_gradient),
static_cast<DType>(param.momentum),
static_cast<DType>(param.lr),
static_cast<DType>(param.wd),
static_cast<DType>(param.rescale_grad));
});
});
});
}
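// GPU kernel for the standard Adam update with a row_sparse gradient and
// dense weight/mean/var arrays; as with the SGD momentum kernel above, rows
// absent from the gradient are treated as zero gradients but are still updated.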
template <int req>
struct AdamStdDnsRspDnsKernel<req, gpu> {
template <typename DType, typename IType, typename RType>
MSHADOW_XINLINE static void Map(int i,
const nnvm::dim_t row_length,
DType* out_data,
DType* mean_data,
DType* var_data,
const DType* weight_data,
const IType* grad_idx,
const DType* grad_data,
const RType* prefix_sum,
const DType clip_gradient,
const DType beta1,
const DType beta2,
const DType lr,
const DType wd,
const DType epsilon,
const DType rescale_grad) {
using namespace mshadow_op;
using nnvm::dim_t;
const dim_t row_id = i / row_length;
const dim_t col_id = i % row_length;
const bool non_zero =
(row_id == 0) ? prefix_sum[0] > 0 : prefix_sum[row_id] > prefix_sum[row_id - 1];
const RType grad_offset = (prefix_sum[row_id] - 1) * row_length + col_id;
DType grad_rescaled = non_zero ? static_cast<DType>(grad_data[grad_offset] * rescale_grad) :
static_cast<DType>(0);
if (clip_gradient >= 0.0f) {
grad_rescaled = clip::Map(grad_rescaled, clip_gradient);
}
grad_rescaled += weight_data[i] * wd;
mean_data[i] = beta1 * mean_data[i] + (1.f - beta1) * grad_rescaled;
var_data[i] = beta2 * var_data[i] + (1.f - beta2) * square::Map(grad_rescaled);
KERNEL_ASSIGN(out_data[i],
req,
weight_data[i] - lr * mean_data[i] / (square_root::Map(var_data[i]) + epsilon));
}
};
template <>
void AdamStdUpdateDnsRspDnsImpl<gpu>(const AdamParam& param,
const OpContext& ctx,
const TBlob& weight,
const NDArray& grad,
const TBlob& mean,
const TBlob& var,
const OpReqType& req,
TBlob* out) {
using namespace mxnet_op;
using namespace rowsparse;
using namespace mshadow;
Stream<gpu>* s = ctx.get_stream<gpu>();
if (req == kNullOp)
return;
CHECK_EQ(req, kWriteInplace) << "kWriteInplace is expected for sparse adam_update";
CHECK_GT(weight.shape_.Size(), 0);
CHECK_GT(mean.shape_.Size(), 0);
CHECK_GT(var.shape_.Size(), 0);
MSHADOW_REAL_TYPE_SWITCH(weight.type_flag_, DType, {
MSHADOW_IDX_TYPE_SWITCH(grad.aux_type(kIdx), IType, {
MXNET_ASSIGN_REQ_SWITCH(req, req_type, {
const DType* weight_data = weight.dptr<DType>();
const IType* grad_idx = grad.aux_data(kIdx).dptr<IType>();
const DType* grad_val = grad.data().dptr<DType>();
DType* mean_data = mean.dptr<DType>();
DType* var_data = var.dptr<DType>();
DType* out_data = out->dptr<DType>();
const nnvm::dim_t num_rows = weight.shape_[0];
const nnvm::dim_t row_length = weight.shape_.ProdShape(1, weight.ndim());
nnvm::dim_t* prefix_sum = nullptr;
void* d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
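// query temp_storage_bytes for the scan, as in the sgd_mom path above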
cub::DeviceScan::InclusiveSum(d_temp_storage,
temp_storage_bytes,
prefix_sum,
prefix_sum,
num_rows,
Stream<gpu>::GetStream(s));
Tensor<gpu, 1, char> workspace = ctx.requested[0].get_space_typed<gpu, 1, char>(
Shape1(num_rows * sizeof(nnvm::dim_t) + temp_storage_bytes), s);
prefix_sum = reinterpret_cast<nnvm::dim_t*>(workspace.dptr_);
d_temp_storage = workspace.dptr_ + num_rows * sizeof(nnvm::dim_t);
// zero the per-row flags; rows present in grad are marked below
Fill<false>(s, TBlob(prefix_sum, Shape1(num_rows), gpu::kDevMask), kWriteTo, 0);
if (grad.storage_initialized()) {
Kernel<MarkRowFlgKernel, gpu>::Launch(s, grad.aux_shape(kIdx)[0], prefix_sum, grad_idx);
// calculate inclusive prefix sum
cub::DeviceScan::InclusiveSum(d_temp_storage,
temp_storage_bytes,
prefix_sum,
prefix_sum,
num_rows,
Stream<gpu>::GetStream(s));
}
Kernel<AdamStdDnsRspDnsKernel<req_type, gpu>, gpu>::Launch(
s,
weight.shape_.Size(),
row_length,
out_data,
mean_data,
var_data,
weight_data,
grad_idx,
grad_val,
prefix_sum,
static_cast<DType>(param.clip_gradient),
static_cast<DType>(param.beta1),
static_cast<DType>(param.beta2),
static_cast<DType>(param.lr),
static_cast<DType>(param.wd),
static_cast<DType>(param.epsilon),
static_cast<DType>(param.rescale_grad));
});
});
});
}
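// Register the GPU compute functions for the optimizer operators.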
NNVM_REGISTER_OP(signsgd_update).set_attr<FCompute>("FCompute<gpu>", SignSGDUpdate<gpu>);
NNVM_REGISTER_OP(signum_update).set_attr<FCompute>("FCompute<gpu>", SignumUpdate<gpu>);
NNVM_REGISTER_OP(sgd_update)
.set_attr<FCompute>("FCompute<gpu>", SGDUpdate<gpu>)
.set_attr<FComputeEx>("FComputeEx<gpu>", SGDUpdateEx<gpu>);
NNVM_REGISTER_OP(sgd_mom_update)
.set_attr<FCompute>("FCompute<gpu>", SGDMomUpdate<gpu>)
.set_attr<FComputeEx>("FComputeEx<gpu>", SGDMomUpdateEx<gpu>);
NNVM_REGISTER_OP(mp_sgd_update).set_attr<FCompute>("FCompute<gpu>", MP_SGDUpdate<gpu>);
NNVM_REGISTER_OP(mp_sgd_mom_update).set_attr<FCompute>("FCompute<gpu>", MP_SGDMomUpdate<gpu>);
NNVM_REGISTER_OP(multi_sgd_update)
.set_attr<FCompute>("FCompute<gpu>", MultiSGDUpdate<gpu, type_identity, 2>);
NNVM_REGISTER_OP(multi_sgd_mom_update)
.set_attr<FCompute>("FCompute<gpu>", MultiSGDMomUpdate<gpu, type_identity, 3>);
NNVM_REGISTER_OP(multi_mp_sgd_update)
.set_attr<FCompute>("FCompute<gpu>", MultiSGDUpdate<gpu, single_precision, 3>);
NNVM_REGISTER_OP(multi_mp_sgd_mom_update)
.set_attr<FCompute>("FCompute<gpu>", MultiSGDMomUpdate<gpu, single_precision, 4>);
NNVM_REGISTER_OP(nag_mom_update).set_attr<FCompute>("FCompute<gpu>", NAGMomUpdate<gpu>);
NNVM_REGISTER_OP(mp_nag_mom_update).set_attr<FCompute>("FCompute<gpu>", MP_NAGMomUpdate<gpu>);
NNVM_REGISTER_OP(ftml_update).set_attr<FCompute>("FCompute<gpu>", FTMLUpdate<gpu>);
NNVM_REGISTER_OP(adam_update)
.set_attr<FCompute>("FCompute<gpu>", AdamUpdate<gpu>)
.set_attr<FComputeEx>("FComputeEx<gpu>", AdamUpdateEx<gpu>);
NNVM_REGISTER_OP(rmsprop_update).set_attr<FCompute>("FCompute<gpu>", RMSPropUpdate<gpu>);
NNVM_REGISTER_OP(rmspropalex_update).set_attr<FCompute>("FCompute<gpu>", RMSPropAlexUpdate<gpu>);
NNVM_REGISTER_OP(ftrl_update)
.set_attr<FCompute>("FCompute<gpu>", FtrlUpdate<gpu>)
.set_attr<FComputeEx>("FComputeEx<gpu>", FtrlUpdateEx<gpu>);
NNVM_REGISTER_OP(_sparse_adagrad_update)
.set_attr<FComputeEx>("FComputeEx<gpu>", AdagradUpdateEx<gpu>);
NNVM_REGISTER_OP(lamb_update_phase1).set_attr<FCompute>("FCompute<gpu>", LambUpdatePhaseOne<gpu>);
NNVM_REGISTER_OP(lamb_update_phase2).set_attr<FCompute>("FCompute<gpu>", LambUpdatePhaseTwo<gpu>);
NNVM_REGISTER_OP(mp_lamb_update_phase1)
.set_attr<FCompute>("FCompute<gpu>", MPLambUpdatePhaseOne<gpu>);
NNVM_REGISTER_OP(mp_lamb_update_phase2)
.set_attr<FCompute>("FCompute<gpu>", MPLambUpdatePhaseTwo<gpu>);
} // namespace op
} // namespace mxnet