| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| import numpy as np |
| import mxnet as mx |
| import mxnet.lr_scheduler as lr_scheduler |
| from mxnet import gluon |
| import unittest |
| from nose.tools import raises |
| import math |
| from mxnet.test_utils import * |
| from common import setup_module, with_seed, teardown |
| |
| @with_seed() |
| def test_learning_rate(): |
| o1 = mx.optimizer.Optimizer(learning_rate=0.01) |
| o1.set_learning_rate(0.2) |
| assert o1.learning_rate == 0.2 |
| |
| lr_s = lr_scheduler.FactorScheduler(step=1) |
| o2 = mx.optimizer.Optimizer(lr_scheduler=lr_s, learning_rate=0.3) |
| assert o2.learning_rate == 0.3 |
| o2.lr_scheduler.base_lr = 0.4 |
| assert o2.learning_rate == 0.4 |
| |
| |
| @raises(UserWarning) |
| @with_seed() |
| def test_learning_rate_expect_user_warning(): |
| lr_s = lr_scheduler.FactorScheduler(step=1) |
| o = mx.optimizer.Optimizer(lr_scheduler=lr_s, learning_rate=0.3) |
| o.set_learning_rate(0.5) |
| |
| |
| @with_seed() |
| def test_lr_wd_mult(): |
| data = mx.sym.Variable('data') |
| bias = mx.sym.Variable('fc1_bias', lr_mult=1.0) |
| fc1 = mx.sym.FullyConnected(data=data, bias=bias, name='fc1', num_hidden=10, lr_mult=0) |
| fc2 = mx.sym.FullyConnected(data=fc1, name='fc2', num_hidden=10, wd_mult=0.5) |
| |
| mod = mx.mod.Module(symbol=fc2, label_names=None, context=default_context()) |
| mod.bind(data_shapes=[('data', (5,10))]) |
| mod.init_params(initializer=mx.init.Uniform(1.0)) |
| mod.init_optimizer(optimizer_params={'learning_rate': 1.0}) |
| args1, _ = mod.get_params() |
| args1 = {k: v.asnumpy() for k, v in args1.items()} |
| mod.forward(mx.io.DataBatch(data=[mx.random.uniform(low=-1.0, high=1.0, shape=(5,10))], label=None), is_train=True) |
| mod.backward(mod.get_outputs()) |
| mod.update() |
| args2, _ = mod.get_params() |
| args2 = {k: v.asnumpy() for k, v in args2.items()} |
| |
| assert mod._optimizer.lr_mult == {'fc1_bias': 1.0, 'fc1_weight': 0.0} |
| assert mod._optimizer.wd_mult == {'fc2_bias': 0.5, 'fc2_weight': 0.5, 'fc1_bias': 0.0} |
| assert mx.test_utils.almost_equal(args1['fc1_weight'], args2['fc1_weight'], 1e-10) |
| assert not mx.test_utils.almost_equal(args1['fc1_bias'], args2['fc1_bias'], 1e-1) |
| assert not mx.test_utils.almost_equal(args1['fc2_weight'], args2['fc2_weight'], 1e-1) |
| |
| def compare_ndarray_tuple(t1, t2, rtol=None, atol=None): |
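| """Recursively assert that two optimizer states (NDArrays or nested tuples of NDArrays) are element-wise close; None states are ignored.""" |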
| if t1 is not None and t2 is not None: |
| if isinstance(t1, tuple): |
| for s1, s2 in zip(t1, t2): |
| compare_ndarray_tuple(s1, s2, rtol, atol) |
| else: |
| assert_almost_equal(t1.asnumpy(), t2.asnumpy(), rtol=rtol, atol=atol) |
| |
| |
| def compare_optimizer(opt1, opt2, shape, dtype, w_stype='default', g_stype='default', |
| rtol=1e-4, atol=1e-5): |
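| """Run one (multi-precision) update with both optimizers on identical random weights and gradients, optionally using sparse storage types, and assert that the resulting states and weights agree within the given tolerances.""" |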
| if w_stype == 'default': |
| w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) |
| w1 = w2.copyto(default_context()) |
| elif w_stype == 'row_sparse' or w_stype == 'csr': |
| w2 = rand_ndarray(shape, w_stype, density=1, dtype=dtype) |
| w1 = w2.copyto(default_context()).tostype('default') |
| else: |
| raise Exception("type not supported yet") |
| if g_stype == 'default': |
| g2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) |
| g1 = g2.copyto(default_context()) |
| elif g_stype == 'row_sparse' or g_stype == 'csr': |
| g2 = rand_ndarray(shape, g_stype, dtype=dtype) |
| g1 = g2.copyto(default_context()).tostype('default') |
| else: |
| raise Exception("type not supported yet") |
| |
| state1 = opt1.create_state_multi_precision(0, w1) |
| state2 = opt2.create_state_multi_precision(0, w2) |
| compare_ndarray_tuple(state1, state2) |
| |
| opt1.update_multi_precision(0, w1, g1, state1) |
| opt2.update_multi_precision(0, w2, g2, state2) |
| compare_ndarray_tuple(state1, state2, rtol=rtol, atol=atol) |
| assert_almost_equal(w1.asnumpy(), w2.asnumpy(), rtol=rtol, atol=atol) |
| |
| # SGD |
| |
| class PySGD(mx.optimizer.Optimizer): |
| """python reference implemenation of sgd""" |
| def __init__(self, learning_rate=0.01, momentum=0.0, multi_precision=False, **kwargs): |
| super(PySGD, self).__init__(learning_rate=learning_rate, **kwargs) |
| self.momentum = momentum |
| self.multi_precision = multi_precision |
| |
| def create_state(self, index, weight): |
| """Create additional optimizer state: momentum |
| |
| Parameters |
| ---------- |
| weight : NDArray |
| The weight data |
| |
| """ |
| momentum = None |
| weight_master_copy = None |
| do_multi_precision = self.multi_precision and weight.dtype == np.float16 |
| if do_multi_precision: |
| if self.momentum != 0.0: |
| momentum = mx.nd.zeros(weight.shape, weight.context, dtype=np.float32) |
| weight_master_copy = array(weight, ctx=weight.context, dtype=np.float32) |
| return (momentum, weight_master_copy) |
| else: |
| if self.momentum != 0.0: |
| momentum = mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype) |
| return momentum |
| |
| def create_state_multi_precision(self, index, weight): |
| return self.create_state(index, weight) |
| |
| def update(self, index, weight, grad, state): |
| """Update the parameters. |
| |
| Parameters |
| ---------- |
| index : int |
| A unique integer key used to index the parameters |
| |
| weight : NDArray |
| weight ndarray |
| |
| grad : NDArray |
| grad ndarray |
| |
| state : NDArray or other objects returned by create_state |
| The auxiliary state used in optimization. |
| """ |
| lr = self._get_lr(index) |
| wd = self._get_wd(index) |
| self._update_count(index) |
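| # multi-precision state is a (momentum, fp32 master weight) tuple; |
| # otherwise state is just the momentum buffer (or None when momentum == 0) |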
| use_multi_precision = isinstance(state, list) or isinstance(state, tuple) |
| |
| if not use_multi_precision: |
| if self.momentum == 0.0: |
| if self.clip_gradient is not None: |
| weight[:] = ((1 - lr*wd)*weight - |
| lr*mx.nd.clip(grad*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) |
| else: |
| weight[:] = (1 - lr*wd)*weight - lr*self.rescale_grad*grad |
| else: |
| mom = state |
| if self.clip_gradient is not None: |
| mom[:] = (self.momentum*mom - lr*wd*weight - |
| lr*mx.nd.clip(grad*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) |
| weight += mom |
| else: |
| mom[:] = self.momentum*mom - lr*wd*weight - lr*self.rescale_grad*grad |
| weight += mom |
| else: |
| grad32 = array(grad, ctx=grad.context, dtype=np.float32) |
| mom = state[0] |
| weight32 = state[1] |
| if self.momentum == 0.0: |
| if self.clip_gradient is not None: |
| weight32[:] = ((1 - lr*wd)*weight32 - |
| lr*mx.nd.clip(grad32*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) |
| else: |
| weight32[:] = (1 - lr*wd)*weight32 - lr*self.rescale_grad*grad32 |
| else: |
| if self.clip_gradient is not None: |
| mom[:] = (self.momentum*mom - lr*wd*weight32 - |
| lr*mx.nd.clip(grad32*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) |
| weight32 += mom |
| else: |
| mom[:] = self.momentum*mom - lr*wd*weight32 - lr*self.rescale_grad*grad32 |
| weight32 += mom |
| tmp = weight32.astype(weight.dtype) |
| tmp.copyto(weight) |
| |
| def update_multi_precision(self, index, weight, grad, state): |
| self.update(index, weight, grad, state) |
| |
| @with_seed() |
| def test_sgd(): |
| opt1 = PySGD |
| opt2 = mx.optimizer.SGD |
| shape = (3, 4, 5) |
| mom_options = [{}, {'momentum': 0.9}] |
| cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] |
| rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] |
| wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] |
| mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] |
| for dtype in [np.float16, np.float32, np.float64]: |
| for mom_option in mom_options: |
| for cg_option in cg_options: |
| for rg_option in rg_options: |
| for wd_option in wd_options: |
| for mp_option in mp_options: |
| kwarg = {} |
| kwarg.update(mom_option) |
| kwarg.update(cg_option) |
| kwarg.update(rg_option) |
| kwarg.update(wd_option) |
| kwarg.update(mp_option) |
| if (dtype == np.float16 and |
| ('multi_precision' not in kwarg or |
| not kwarg['multi_precision'])): |
| continue |
| if dtype == np.float16: |
| compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=1e-3) |
| else: |
| compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) |
| # test operator fallback on cpu |
| if dtype != np.float16: |
| compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape[:2], |
| dtype, w_stype='csr', g_stype='csr') |
| |
| class PySparseSGD(mx.optimizer.Optimizer): |
| """python reference implemenation of sgd""" |
| def __init__(self, learning_rate=0.01, momentum=0.0, **kwargs): |
| super(PySparseSGD, self).__init__(learning_rate=learning_rate, **kwargs) |
| self.momentum = momentum |
| |
| def create_state(self, index, weight): |
| """Create additional optimizer state: momentum |
| |
| Parameters |
| ---------- |
| weight : NDArray |
| The weight data |
| |
| """ |
| if self.momentum == 0.0: |
| return None |
| else: |
| return mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype) |
| |
| def update(self, index, weight, grad, state): |
| """Update the parameters. |
| |
| Parameters |
| ---------- |
| index : int |
| A unique integer key used to index the parameters |
| |
| weight : NDArray |
| weight ndarray |
| |
| grad : NDArray |
| grad ndarray |
| |
| state : NDArray or other objects returned by create_state |
| The auxiliary state used in optimization. |
| """ |
| lr = self._get_lr(index) |
| wd = self._get_wd(index) |
| self._update_count(index) |
| num_rows = weight.shape[0] |
| if self.momentum == 0.0: |
| # Update on a per row basis, skip all-zero rows |
| for row in range(num_rows): |
| grad_row = grad[row].asnumpy() |
| all_zeros = mx.test_utils.almost_equal(grad_row, np.zeros_like(grad_row)) |
| if all_zeros: |
| continue |
| if self.clip_gradient is not None: |
| weight[row] = ((1 - lr*wd)*weight[row] - |
| lr*mx.nd.clip(grad[row]*self.rescale_grad, |
| -self.clip_gradient, self.clip_gradient)) |
| else: |
| weight[row] = (1 - lr*wd)*weight[row] - lr*self.rescale_grad*grad[row] |
| else: |
| mom = state |
| for row in range(num_rows): |
| grad_row = grad[row].asnumpy() |
| all_zeros = mx.test_utils.almost_equal(grad_row, np.zeros_like(grad_row)) |
| if all_zeros: |
| continue |
| if self.clip_gradient is not None: |
| mom[row] = (self.momentum*mom[row] - lr*wd*weight[row] - |
| lr*mx.nd.clip(grad[row]*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) |
| weight[row] += mom[row] |
| else: |
| mom[row] = self.momentum*mom[row] - lr*wd*weight[row] - lr*self.rescale_grad*grad[row] |
| weight[row] += mom[row] |
| |
| @with_seed() |
| def test_sparse_sgd(): |
| opt1 = PySparseSGD |
| opt2 = mx.optimizer.SGD |
| shape = (3, 4, 5) |
| mom_options = [{}, {'momentum': 0.9}] |
| cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] |
| rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] |
| wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] |
| mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] |
| for dtype in [np.float32]: |
| for mom_option in mom_options: |
| for cg_option in cg_options: |
| for rg_option in rg_options: |
| for wd_option in wd_options: |
| for mp_option in mp_options: |
| kwarg = {} |
| kwarg.update(mom_option) |
| kwarg.update(cg_option) |
| kwarg.update(rg_option) |
| kwarg.update(wd_option) |
| kwarg.update(mp_option) |
| compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, |
| w_stype='row_sparse', g_stype='row_sparse') |
| compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, |
| w_stype='default', g_stype='row_sparse') |
| |
| |
| @with_seed() |
| def test_std_sparse_sgd(): |
| opt1 = PySGD |
| opt2 = mx.optimizer.SGD |
| shape = (3, 4, 5) |
| mom_options = [{'momentum': 0.0}, {'momentum': 0.9}] |
| cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] |
| rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] |
| wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] |
| for dtype in [np.float32]: |
| for mom_option in mom_options: |
| for cg_option in cg_options: |
| for rg_option in rg_options: |
| for wd_option in wd_options: |
| kwarg = {} |
| kwarg.update(mom_option) |
| kwarg.update(cg_option) |
| kwarg.update(rg_option) |
| kwarg.update(wd_option) |
| compare_optimizer(opt1(**kwarg), opt2(lazy_update=False, **kwarg), shape, dtype, |
| w_stype='row_sparse', g_stype='row_sparse') |
| compare_optimizer(opt1(**kwarg), opt2(lazy_update=False, **kwarg), shape, dtype, |
| w_stype='default', g_stype='row_sparse') |
| |
| |
| class PyNAG(PySGD): |
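| """python reference implementation of NAG (Nesterov accelerated gradient)""" |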
| def __init__(self, **kwargs): |
| super(PyNAG, self).__init__(**kwargs) |
| |
| def create_state(self, index, weight): |
| """Create additional optimizer state: momentum |
| |
| Parameters |
| ---------- |
| weight : NDArray |
| The weight data |
| |
| """ |
| momentum = None |
| weight_master_copy = None |
| do_multi_precision = self.multi_precision and weight.dtype == np.float16 |
| if do_multi_precision: |
| if self.momentum != 0.0: |
| momentum = mx.nd.zeros(weight.shape, weight.context, dtype=np.float32) |
| weight_master_copy = array(weight, ctx=weight.context, dtype=np.float32) |
| return (weight_master_copy, momentum) |
| else: |
| if self.momentum != 0.0: |
| momentum = mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype) |
| return momentum |
| |
| def create_state_multi_precision(self, index, weight): |
| return self.create_state(index, weight) |
| |
| def update(self, index, weight, grad, state): |
| """Update the parameters. |
| |
| Parameters |
| ---------- |
| index : int |
| A unique integer key used to index the parameters |
| |
| weight : NDArray |
| weight ndarray |
| |
| grad : NDArray |
| grad ndarray |
| |
| state : NDArray or other objects returned by create_state |
| The auxiliary state used in optimization. |
| """ |
| lr = self._get_lr(index) |
| wd = self._get_wd(index) |
| self._update_count(index) |
| use_multi_precision = isinstance(state, list) or isinstance(state, tuple) |
| if not use_multi_precision: |
| grad = grad * self.rescale_grad |
| if self.clip_gradient is not None: |
| grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) |
| if self.momentum == 0.0: |
| weight[:] += -lr * (grad + wd * weight) |
| else: |
| mom = state |
| mom[:] *= self.momentum |
| grad += wd * weight |
| mom[:] += grad |
| grad[:] += self.momentum * mom |
| weight[:] += -lr * grad |
| else: |
| grad32 = array(grad, ctx=grad.context, dtype=np.float32) |
| grad32 = grad32 * self.rescale_grad |
| if self.clip_gradient is not None: |
| grad32 = mx.nd.clip(grad32, -self.clip_gradient, self.clip_gradient) |
| mom = state[1] |
| weight32 = state[0] |
| if self.momentum == 0.0: |
| weight32[:] += -lr * (grad32 + wd * weight32) |
| else: |
| mom[:] *= self.momentum |
| grad32 += wd * weight32 |
| mom[:] += grad32 |
| grad32[:] += self.momentum * mom |
| weight32[:] += -lr * grad32 |
| tmp = weight32.astype(weight.dtype) |
| tmp.copyto(weight) |
| |
| @with_seed(0) |
| def test_nag(): |
| opt1 = PyNAG |
| opt2 = mx.optimizer.NAG |
| shape = (3, 4, 5) |
| mom_options = [{}, {'momentum': 0.9}] |
| cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] |
| rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] |
| wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] |
| mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] |
| for dtype in [np.float16, np.float32, np.float64]: |
| for mom_option in mom_options: |
| for cg_option in cg_options: |
| for rg_option in rg_options: |
| for wd_option in wd_options: |
| for mp_option in mp_options: |
| kwarg = {} |
| kwarg.update(mom_option) |
| kwarg.update(cg_option) |
| kwarg.update(rg_option) |
| kwarg.update(wd_option) |
| kwarg.update(mp_option) |
| if (dtype == np.float16 and |
| ('multi_precision' not in kwarg or |
| not kwarg['multi_precision'])): |
| continue |
| compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) |
| |
| |
| |
| # FTML |
| |
| class PyFTML(mx.optimizer.Optimizer): |
| """python reference implemenation of FTML""" |
| def __init__(self, beta1=0.6, beta2=0.999, epsilon=1e-8, **kwargs): |
| super(PyFTML, self).__init__(**kwargs) |
| self.beta1 = beta1 |
| self.beta2 = beta2 |
| self.epsilon = epsilon |
| |
| def create_state(self, index, weight): |
| return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # d_0 |
| mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # v_0 |
| mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype)) # z_0 |
| |
| def update(self, index, weight, grad, state): |
| assert isinstance(weight, mx.nd.NDArray) |
| assert isinstance(grad, mx.nd.NDArray) |
| self._update_count(index) |
| lr = self._get_lr(index) |
| wd = self._get_wd(index) |
| t = self._index_update_count[index] |
| |
| grad = grad * self.rescale_grad + wd * weight |
| if self.clip_gradient is not None: |
| grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) |
| # get previous states |
| prev_d, prev_v, prev_z = state |
| # compute states |
| v_t = self.beta2 * prev_v + (1 - self.beta2) * mx.nd.square(grad) |
| d_t = (1 - pow(self.beta1, t)) / lr * (mx.nd.sqrt(v_t / (1 - pow(self.beta2, t))) + self.epsilon) |
| sigma_t = d_t - self.beta1 * prev_d |
| z_t = self.beta1 * prev_z + (1 - self.beta1) * grad - sigma_t * weight |
| # update weight |
| weight[:] = - z_t / d_t |
| # update states |
| prev_d[:] = d_t |
| prev_v[:] = v_t |
| prev_z[:] = z_t |
| |
| @with_seed(0) |
| def test_ftml(): |
| opt1 = PyFTML |
| opt2 = mx.optimizer.FTML |
| shape = (3, 4, 5) |
| beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}] |
| beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}] |
| cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] |
| rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] |
| wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] |
| for dtype in [np.float32]: |
| for beta1_option in beta1_options: |
| for beta2_option in beta2_options: |
| for cg_option in cg_options: |
| for rg_option in rg_options: |
| for wd_option in wd_options: |
| kwarg = {} |
| kwarg.update(beta1_option) |
| kwarg.update(beta2_option) |
| kwarg.update(cg_option) |
| kwarg.update(rg_option) |
| kwarg.update(wd_option) |
| compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) |
| |
| |
| # ADAM |
| |
| class PyAdam(mx.optimizer.Optimizer): |
| """python reference implemenation of adam""" |
| def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, |
| decay_factor=(1 - 1e-8), lazy_update=True, **kwargs): |
| super(PyAdam, self).__init__(learning_rate=learning_rate, **kwargs) |
| self.beta1 = beta1 |
| self.beta2 = beta2 |
| self.epsilon = epsilon |
| self.decay_factor = decay_factor |
| self.lazy_update = lazy_update |
| |
| def create_state(self, index, weight): |
| """Create additional optimizer state: mean, variance |
| |
| Parameters |
| ---------- |
| weight : NDArray |
| The weight data |
| |
| """ |
| return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # mean |
| mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance |
| |
| def update(self, index, weight, grad, state): |
| """Update the parameters. |
| |
| Parameters |
| ---------- |
| index : int |
| A unique integer key used to index the parameters |
| |
| weight : NDArray |
| weight ndarray |
| |
| grad : NDArray |
| grad ndarray |
| |
| state : NDArray or other objects returned by create_state |
| The auxiliary state used in optimization. |
| """ |
| lr = self._get_lr(index) |
| self._update_count(index) |
| |
| t = self._index_update_count[index] |
| mean, variance = state |
| |
| wd = self._get_wd(index) |
| num_rows = weight.shape[0] |
| coef1 = 1. - self.beta1**t |
| coef2 = 1. - self.beta2**t |
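| # fold Adam's bias-correction factors into the effective learning rate |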
| lr *= math.sqrt(coef2)/coef1 |
| for row in range(num_rows): |
| # check row slices of all zeros |
| all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), np.zeros_like(grad[row].asnumpy())) |
| # skip zeros during lazy update |
| if all_zeros and self.lazy_update: |
| continue |
| grad[row] = grad[row] * self.rescale_grad + wd * weight[row] |
| # clip gradients |
| if self.clip_gradient is not None: |
| mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient, out=grad[row]) |
| # update mean |
| mean[row] *= self.beta1 |
| mean[row] += grad[row] * (1. - self.beta1) |
| # update variance |
| variance[row] *= self.beta2 |
| variance[row] += (1 - self.beta2) * mx.nd.square(grad[row], out=grad[row]) |
| # update weight |
| weight[row] -= lr*mean[row]/(mx.nd.sqrt(variance[row]) + self.epsilon) |
| |
| |
| @with_seed() |
| def test_adam(): |
| opt1 = PyAdam |
| opt2 = mx.optimizer.Adam |
| shape = (3, 4, 5) |
| cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] |
| rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] |
| wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] |
| mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] |
| for dtype in [np.float16, np.float32, np.float64]: |
| for cg_option in cg_options: |
| for rg_option in rg_options: |
| for wd_option in wd_options: |
| for mp_option in mp_options: |
| kwarg = {} |
| kwarg.update(cg_option) |
| kwarg.update(rg_option) |
| kwarg.update(wd_option) |
| kwarg.update(mp_option) |
| if (dtype == np.float16 and |
| ('multi_precision' not in kwarg or |
| not kwarg['multi_precision'])): |
| continue |
| # atol 2e-5 needed to pass with seed 1248389097 |
| compare_optimizer(opt1(lazy_update=False, **kwarg), opt2(**kwarg), shape, dtype, |
| rtol=1e-4, atol=2e-5) |
| # atol 2e-5 needed to pass with seed 781809840 |
| compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, |
| dtype, w_stype='row_sparse', g_stype='row_sparse', |
| rtol=1e-4, atol=2e-5) |
| compare_optimizer(opt1(lazy_update=False, **kwarg), opt2(lazy_update=False, **kwarg), shape, |
| dtype, w_stype='row_sparse', g_stype='row_sparse', |
| rtol=1e-4, atol=2e-5) |
| compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, |
| dtype, w_stype='default', g_stype='row_sparse', |
| rtol=1e-4, atol=2e-5) |
| compare_optimizer(opt1(lazy_update=False, **kwarg), opt2(lazy_update=False, **kwarg), shape, |
| dtype, w_stype='default', g_stype='row_sparse', |
| rtol=1e-4, atol=2e-5) |
| |
| # Signum |
| class PySignum(mx.optimizer.Optimizer): |
| """The python reference of Signum optimizer. |
| |
| The optimizer updates the weight by: |
| |
| rescaled_grad = rescale_grad * clip(grad, clip_gradient) + wd * weight |
| state = momentum * state + (1-momentum)*rescaled_grad |
| weight = (1 - lr * wd_lh) * weight - lr * sign(state) |
| |
| See the original paper at: https://jeremybernste.in/projects/amazon/signum.pdf |
| |
| For details of the update algorithm see |
| :class:`~mxnet.ndarray.signsgd_update` and :class:`~mxnet.ndarray.signum_update`. |
| |
| This optimizer accepts the following parameters in addition to those accepted |
| by :class:`.Optimizer`. |
| |
| Parameters |
| ---------- |
| momentum : float, optional |
| The momentum value. |
| wd_lh : float, optional |
| The amount of decoupled weight decay regularization. |
| """ |
| def __init__(self, learning_rate=0.01, momentum=0.9, wd_lh=0.0, **kwargs): |
| super(PySignum, self).__init__(learning_rate=learning_rate, **kwargs) |
| self.momentum = momentum |
| self.wd_lh = wd_lh |
| |
| def create_state(self, index, weight): |
| momentum = None |
| if self.momentum != 0.0: |
| momentum = mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype, stype=weight.stype) |
| return momentum |
| |
| def update(self, index, weight, grad, state): |
| self._update_count(index) |
| lr = self._get_lr(index) |
| wd = self._get_wd(index) |
| |
| if state is not None: |
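| # momentum buffer present: Signum update, step along the sign of the momentum |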
| mom = state |
| if self.clip_gradient is not None: |
| mom[:] = (self.momentum*mom - (1-self.momentum)*(wd*weight + |
| mx.nd.clip(grad*self.rescale_grad, -self.clip_gradient, self.clip_gradient))) |
| else: |
| mom[:] = self.momentum*mom - (1-self.momentum)*wd*weight - (1-self.momentum)*self.rescale_grad*grad |
| weight[:] = (1 - lr*self.wd_lh)*weight + lr*mx.nd.sign(mom) |
| else: |
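| # no momentum buffer: SignSGD update, step along the sign of the gradient |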
| weight[:] = (1 - lr*(wd+self.wd_lh))*weight - lr*mx.nd.sign(grad) |
| |
| @with_seed(0) |
| def test_signum(): |
| opt1 = PySignum |
| opt2 = mx.optimizer.Signum |
| shape = (3, 4, 5) |
| cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] |
| rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] |
| wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] |
| wd_lh_options = [{}, {'wd_lh': 0.015}, {'wd_lh': 0.0}] |
| mom_options = [{}, {'momentum': 0.9}] |
| lr_options = [{'learning_rate': 0.05},{'learning_rate': 0.01}] |
| for dtype in [np.float32, np.float64]: |
| for cg_option in cg_options: |
| for rg_option in rg_options: |
| for wd_option in wd_options: |
| for wd_lh_option in wd_lh_options: |
| for lr_option in lr_options: |
| for mom_option in mom_options: |
| kwarg = {} |
| kwarg.update(cg_option) |
| kwarg.update(rg_option) |
| kwarg.update(wd_option) |
| kwarg.update(wd_lh_option) |
| kwarg.update(lr_option) |
| kwarg.update(mom_option) |
| compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) |
| |
| |
| # RMSProp |
| class PyRMSProp(mx.optimizer.Optimizer): |
| """RMSProp optimizer of Tieleman & Hinton, 2012, |
| |
| For centered=False, the code follows the version in |
| http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf by |
| Tieleman & Hinton, 2012 |
| |
| For centered=True, the code follows the version in |
| http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013. |
| |
| Parameters |
| ---------- |
| learning_rate : float, optional |
| Step size. |
| Default value is set to 0.001. |
| gamma1: float, optional |
| decay factor of moving average for gradient, gradient^2. |
| Default value is set to 0.9. |
| gamma2: float, optional |
| "momentum" factor. |
| Default value is set to 0.9. |
| Only used if centered=True. |
| epsilon : float, optional |
| Default value is set to 1e-8. |
| centered : boolean, optional |
| Use Graves' or Tieleman & Hinton's version of RMSProp. |
| wd : float, optional |
| L2 regularization coefficient added to all the weights. |
| rescale_grad : float, optional |
| rescaling factor of gradient. |
| clip_gradient : float, optional |
| clip gradient in range [-clip_gradient, clip_gradient] |
| clip_weights : float, optional |
| clip weights in range [-clip_weights, clip_weights] |
| |
| """ |
| def __init__(self, learning_rate=0.001, gamma1=0.9, gamma2=0.9, |
| epsilon=1e-8, centered=False, clip_weights=None, **kwargs): |
| super(PyRMSProp, self).__init__(learning_rate=learning_rate, **kwargs) |
| self.centered = centered |
| self.gamma1 = gamma1 |
| self.gamma2 = gamma2 |
| self.epsilon = epsilon |
| self.clip_weights = clip_weights |
| |
| def create_state(self, index, weight): |
| """Create additional optimizer state. |
| |
| For centered=False: n |
| For centered=True: n, g, delta |
| |
| Parameters |
| ---------- |
| weight : NDArray |
| The weight data |
| """ |
| if self.centered: |
| return (mx.nd.zeros(weight.shape, weight.context), # n |
| mx.nd.zeros(weight.shape, weight.context), # g |
| mx.nd.zeros(weight.shape, weight.context)) # delta |
| else: |
| return (mx.nd.zeros(weight.shape, weight.context), ) # n |
| |
| def update(self, index, weight, grad, state): |
| """Update the parameters. |
| |
| Parameters |
| ---------- |
| index : int |
| A unique integer key used to index the parameters |
| |
| weight : NDArray |
| weight ndarray |
| |
| grad : NDArray |
| grad ndarray |
| |
| state : NDArray or other objects returned by create_state |
| The auxiliary state used in optimization. |
| """ |
| lr = self._get_lr(index) |
| wd = self._get_wd(index) |
| self._update_count(index) |
| grad = grad * self.rescale_grad + wd * weight |
| |
| if not self.centered: |
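| # plain RMSProp (Tieleman & Hinton): only the running average n of squared gradients |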
| (n, ) = state |
| if self.clip_gradient is not None: |
| grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) |
| n[:] = (1 - self.gamma1) * (grad * grad) + self.gamma1 * n |
| weight[:] -= lr * grad/(mx.nd.sqrt(n + self.epsilon)) |
| |
| else: |
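| # centered RMSProp (Graves, 2013): also track the mean gradient g and an update buffer delta |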
| n, g, delta = state |
| if self.clip_gradient is not None: |
| grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) |
| n[:] = (1 - self.gamma1) * (grad * grad) + self.gamma1 * n |
| g[:] = (1 - self.gamma1) * grad + self.gamma1 * g |
| delta[:] = (self.gamma2) * delta - lr * grad/(mx.nd.sqrt(n - g*g + self.epsilon)) |
| weight[:] += delta |
| |
| if self.clip_weights: |
| mx.ndarray.clip(weight, -self.clip_weights, self.clip_weights, out=weight) |
| |
| @with_seed() |
| def test_rms(): |
| opt1 = PyRMSProp |
| opt2 = mx.optimizer.RMSProp |
| shape = (3, 4, 5) |
| cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] |
| cw_options = [{}, {'clip_weights': 0.01}] |
| center_options = [{}, {'centered': False}, {'centered': True}] |
| rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] |
| wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] |
| mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] |
| for dtype in [np.float16, np.float32]: |
| # Relax the floating point comparison tolerances to avoid flaky test failures. |
| rtol, atol = (1e-1, 1e-1) if dtype is np.float16 else (1e-2, 1e-2) |
| |
| for cw_option in cw_options: |
| for cg_option in cg_options: |
| for center_option in center_options: |
| for rg_option in rg_options: |
| for wd_option in wd_options: |
| for mp_option in mp_options: |
| kwarg = {} |
| kwarg.update(cw_option) |
| kwarg.update(cg_option) |
| kwarg.update(center_option) |
| kwarg.update(rg_option) |
| kwarg.update(wd_option) |
| kwarg.update(mp_option) |
| if (dtype == np.float16 and |
| ('multi_precision' not in kwarg or |
| not kwarg['multi_precision'])): |
| continue |
| compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=rtol, atol=atol) |
| if (default_context() == mx.cpu()): |
| compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, g_stype='row_sparse', rtol=rtol, atol=atol) |
| |
| class PyFtrl(mx.optimizer.Optimizer): |
| """The Ftrl optimizer. |
| |
| Referenced from *Ad Click Prediction: a View from the Trenches*, available at |
| http://dl.acm.org/citation.cfm?id=2488200. |
| |
| Parameters |
| ---------- |
| lamda1 : float, optional |
| L1 regularization coefficient. |
| learning_rate : float, optional |
| The initial learning rate. |
| beta : float, optional |
| Per-coordinate learning rate correlation parameter. |
| eta : |
| .. math:: |
| \\eta_{t,i} = \\frac{learningrate}{\\beta+\\sqrt{\\sum_{s=1}^t g_{s,i}^2}} |
| """ |
| |
| def __init__(self, lamda1=0.01, learning_rate=0.1, beta=1, lazy_update=False, **kwargs): |
| super(PyFtrl, self).__init__(**kwargs) |
| self.lamda1 = lamda1 |
| self.beta = beta |
| self.lr = learning_rate |
| self.lazy_update = lazy_update |
| |
| def create_state(self, index, weight): |
| return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # dn |
| mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype)) # n |
| |
| def update(self, index, weight, grad, state): |
| self._update_count(index) |
| wd = self._get_wd(index) |
| lr = self._get_lr(index) |
| num_rows = weight.shape[0] |
| |
| dn, n = state |
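| # dn accumulates the adjusted gradients, n accumulates the squared gradients (FTRL-Proximal) |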
| for row in range(num_rows): |
| all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), np.zeros_like(grad[row].asnumpy())) |
| if all_zeros and self.lazy_update: |
| continue |
| grad[row] = grad[row] * self.rescale_grad |
| if self.clip_gradient is not None: |
| mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient, out=grad[row]) |
| |
| # update dn, n |
| dn[row] += grad[row] - (mx.nd.sqrt(n[row] + grad[row] * grad[row]) - mx.nd.sqrt(n[row])) * weight[row] / lr |
| n[row] += grad[row] * grad[row] |
| |
| # update weight |
| weight[row] = (mx.nd.sign(dn[row]) * self.lamda1 - dn[row]) / \ |
| ((self.beta + mx.nd.sqrt(n[row])) / lr + wd) * (mx.nd.abs(dn[row]) > self.lamda1) |
| |
| @with_seed() |
| def test_ftrl(): |
| opt1 = PyFtrl |
| opt2 = mx.optimizer.Ftrl |
| shape = (3, 4, 5) |
| kwargs = [{}, |
| {'clip_gradient': 0.5}, |
| {'clip_gradient': 0.4, 'rescale_grad': 0.14}, |
| {'rescale_grad': 0.8}, |
| {'clip_gradient': 0.5, 'wd': 0.07}, |
| {'clip_gradient': 0.4, 'rescale_grad': 0.14, 'wd': 0.03}, |
| {'rescale_grad': 0.8, 'wd': 0.05}, |
| {'rescale_grad': 0.8, 'wd': 0.05, 'lamda1': 0.01}, |
| {'clip_gradient': 0.5, 'wd': 0.07, 'lamda1': 1.0}] |
| for kwarg in kwargs: |
| compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, np.float32) |
| compare_optimizer(opt1(lazy_update=True, **kwarg), opt2(**kwarg), shape, |
| np.float32, w_stype='row_sparse', g_stype='row_sparse') |
| |
| @with_seed(1234) |
| def test_nadam(): |
| |
| def get_net(num_hidden, flatten=True): |
| data = mx.symbol.Variable('data') |
| fc1 = mx.symbol.FullyConnected(data, name='fc1', num_hidden=128, flatten=flatten) |
| act1 = mx.symbol.Activation(fc1, name='relu1', act_type="relu") |
| fc2 = mx.symbol.FullyConnected(act1, name = 'fc2', num_hidden = 64, flatten=flatten) |
| act2 = mx.symbol.Activation(fc2, name='relu2', act_type="relu") |
| fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=num_hidden, flatten=flatten) |
| return fc3 |
| |
| N = 20 |
| data = mx.random.uniform(-1, 1, shape=(N, 10)) |
| label = mx.random.uniform(-1, 1, shape=(N, 1)) |
| data_iter = mx.io.NDArrayIter(data, label, batch_size=5, label_name='label', shuffle=True) |
| output = get_net(1) |
| l = mx.symbol.Variable('label') |
| Loss = gluon.loss.L1Loss() |
| loss = Loss(output, l) |
| loss = mx.sym.make_loss(loss) |
| mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) |
| mod.fit(data_iter, num_epoch=60, optimizer_params={'learning_rate': 0.0005, 'wd': 0.0005}, |
| initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(), |
| optimizer='nadam') |
| assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.1 |
| |
| # AdaGrad |
| class PyAdaGrad(mx.optimizer.Optimizer): |
| """The python reference of AdaGrad optimizer. |
| |
| This class implements the AdaGrad optimizer described in *Adaptive Subgradient |
| Methods for Online Learning and Stochastic Optimization*, and available at |
| http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf. |
| |
| Updates are applied by:: |
| |
| rescaled_grad = clip(grad * rescale_grad + wd * weight, clip_gradient) |
| history = history + square(rescaled_grad) |
| w = w - learning_rate * rescaled_grad / sqrt(history + epsilon) |
| |
| This optimizer accepts the following parameters in addition to those accepted |
| by :class:`.Optimizer`. |
| |
| Parameters |
| ---------- |
| eps: float, optional |
| Small value to avoid division by 0. |
| |
| """ |
| def __init__(self, eps=1e-7, **kwargs): |
| super(PyAdaGrad, self).__init__(**kwargs) |
| self.float_stable_eps = eps |
| |
| def create_state(self, index, weight): |
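| # a single state buffer: the accumulated squared-gradient 'history', matching the weight's storage type |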
| return mx.nd.zeros(weight.shape, weight.context, stype=weight.stype) |
| |
| def update(self, index, weight, grad, state): |
| self._update_count(index) |
| lr = self._get_lr(index) |
| wd = self._get_wd(index) |
| |
| history = state |
| grad = grad * self.rescale_grad |
| if self.clip_gradient is not None: |
| grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) |
| history[:] += mx.nd.square(grad) |
| div = grad / mx.nd.sqrt(history + self.float_stable_eps) |
| weight[:] += (div + weight * wd) * -lr |
| |
| @with_seed(0) |
| def test_adagrad(): |
| opt1 = PyAdaGrad |
| opt2 = mx.optimizer.AdaGrad |
| shape = (3, 4, 5) |
| eps_options = [{}, {'eps': 1e-8}] |
| cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] |
| rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] |
| wd_options = [{}, {'wd': 0.0}] |
| for dtype in [np.float32]: |
| for eps_option in eps_options: |
| for cg_option in cg_options: |
| for rg_option in rg_options: |
| for wd_option in wd_options: |
| kwarg = {} |
| kwarg.update(eps_option) |
| kwarg.update(cg_option) |
| kwarg.update(rg_option) |
| kwarg.update(wd_option) |
| compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) |
| if wd_option.get('wd', 0.0) == 0.0: |
| compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, |
| w_stype='row_sparse', g_stype='row_sparse') |
| compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, |
| g_stype='row_sparse') |
| |
| |
| |
| if __name__ == '__main__': |
| import nose |
| nose.runmodule() |