| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| package AI::MXNet::Optimizer; |
| use strict; |
| use warnings; |
| use AI::MXNet::NS; |
| use AI::MXNet::Base; |
| use AI::MXNet::NDArray; |
| use AI::MXNet::Random; |
| use List::Util qw(max); |
| |
| =head1 NAME |
| |
| AI::MXNet::Optimizer - Common Optimization algorithms with regularizations. |
| |
| =head1 DESCRIPTION |
| |
| Common Optimization algorithms with regularizations. |
| =cut |
| |
| use Mouse; |
| use AI::MXNet::Function::Parameters; |
| my %opt_registry; |
| method get_opt_registry() |
| { |
| return \%opt_registry; |
| } |
| |
| method register() |
| { |
| my $name = $self; |
| ($name) = $name =~ /::(\w+)$/; |
| { no strict 'refs'; *{__PACKAGE__."::$name"} = sub { shift; $self->new(@_) }; } |
| $name = lc $name; |
| if(exists $opt_registry{ $name }) |
| { |
| my $existing = $opt_registry{ $name }; |
| warn( |
| "WARNING: New optimizer $self.$name" |
| ."is overriding existing optimizer $existing.$name" |
| ); |
| } |
| $opt_registry{ $name } = $self; |
| } |
| |
| =head2 create_optimizer |
| |
| Create an optimizer with specified name. |
| |
| Parameters |
| ---------- |
| $name: Str |
| Name of required optimizer. Should be the name |
| of a subclass of Optimizer. Case insensitive. |
| |
| :$rescale_grad : Num |
| Rescaling factor on gradient. Normally should be 1/batch_size. |
| |
| %kwargs: Hash |
| Parameters for optimizer |
| |
| Returns |
| ------- |
| opt : Optimizer |
| The result optimizer. |
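
    Example
    -------

    A minimal usage sketch; the optimizer name is looked up case-insensitively
    in the registry, and the parameter values below are illustrative only:

        use AI::MXNet;
        my $opt = AI::MXNet::Optimizer->create_optimizer(
            'sgd',
            learning_rate => 0.1,
            rescale_grad  => 1/128,   # typically 1/batch_size
        );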
| =cut |
| |
| method create_optimizer(Str $name, %kwargs) |
| { |
| if(exists $opt_registry{ lc $name }) |
| { |
| my $rescale_grad = delete($kwargs{rescale_grad})//1; |
| return $opt_registry{ lc $name }->new( |
| rescale_grad => $rescale_grad, |
| %kwargs |
| ); |
| } |
| confess("Cannot find optimizer $name"); |
| } |
| |
| *create = \&create_optimizer; |
| |
| has 'rescale_grad' => (is => "rw", isa => "Num", default=>1); |
| has 'lr' => (is => "rw", isa => "Num"); |
| has 'learning_rate' => (is => "rw", isa => "Num", default => 0.01); |
| has 'lr_scheduler' => (is => "rw", isa => "Maybe[AI::MXNet::LRScheduler]"); |
| has 'wd' => (is => "rw", isa => "Num", default => 0); |
| has 'lr_mult' => (is => "rw", isa => "HashRef", default => sub { +{} }); |
| has 'wd_mult' => (is => "rw", isa => "HashRef", , default => sub { +{} }); |
| has 'num_update' => (is => "rw", isa => "Int"); |
| has 'begin_num_update' => (is => "rw", isa => "Int", default => 0); |
| has '_index_update_count' => (is => "rw", isa => "HashRef", default => sub { +{} }); |
| has 'clip_gradient' => (is => "rw", isa => "Maybe[Num]"); |
| has 'param_idx2name' => (is => "rw", isa => "HashRef[Str]", default => sub { +{} }); |
| has 'idx2name' => (is => "rw", isa => "HashRef[Str]"); |
| has 'sym' => (is => "rw", isa => "Maybe[AI::MXNet::Symbol]"); |
| has 'param_dict' => (is => "rw", isa => "HashRef", default => sub { +{} }); |
| |
| sub BUILD |
| { |
| my $self = shift; |
| if($self->lr_scheduler) |
| { |
| $self->lr_scheduler->base_lr($self->learning_rate); |
| } |
| $self->lr($self->learning_rate); |
| $self->num_update($self->begin_num_update); |
| $self->idx2name({ %{ $self->param_idx2name } }); |
| $self->set_lr_mult({}); |
| $self->set_wd_mult({}); |
| } |
| # Create additional optimizer state such as momentum. |
| # override in implementations. |
| method create_state($index, $weight){} |
| |
| # Update the parameters. override in implementations |
| method update($index, $weight, $grad, $state){} |
| |
| # set lr scale is deprecated. Use set_lr_mult instead. |
| method set_lr_scale($args_lrscale) |
| { |
| Carp::cluck("set lr scale is deprecated. Use set_lr_mult instead."); |
| } |
| |
| =head2 set_lr_mult |
| |
    Sets individual learning rate multipliers for parameters.

    Parameters
    ----------
    args_lr_mult : hash ref of Str|Int to Num
        Sets the lr multiplier for the given name/index.
        Setting the multiplier by index is supported for backward compatibility,
        but we recommend using names and symbols.
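
    A minimal usage sketch; 'fc1_weight' and 'fc1_bias' are hypothetical
    parameter names, and the multiplier values are illustrative only:

        use AI::MXNet;
        my $opt = AI::MXNet::Optimizer->create_optimizer('sgd', learning_rate => 0.1);
        $opt->set_lr_mult({ fc1_weight => 0.1, fc1_bias => 2 });   # hypothetical names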
| =cut |
| |
| method set_lr_mult(HashRef[Num] $args_lr_mult) |
| { |
| $self->lr_mult({}); |
| if($self->sym) |
| { |
| my $attr = $self->sym->attr_dict(); |
| for my $name (@{ $self->sym->list_arguments() }) |
| { |
| if(exists $attr->{ $name } and exists $attr->{ $name }{ __lr_mult__ }) |
| { |
| $self->lr_mult->{ $name } = $attr->{ $name }{ __lr_mult__ }; |
| } |
| } |
| } |
| $self->lr_mult({ %{ $self->lr_mult }, %{ $args_lr_mult } }); |
| } |
| |
| =head2 set_wd_mult |
| |
    Sets individual weight decay multipliers for parameters.
    By default the wd multiplier is 0 for all parameters whose name doesn't
    end with _weight or _gamma, if param_idx2name is provided.

    Parameters
    ----------
    args_wd_mult : hash ref of Str|Int to Num
        Sets the wd multiplier for the given name/index.
        Setting the multiplier by index is supported for backward compatibility,
        but we recommend using names and symbols.
| =cut |
| |
| method set_wd_mult(HashRef[Num] $args_wd_mult) |
| { |
| $self->wd_mult({}); |
| for my $n (values %{ $self->idx2name }) |
| { |
| if(not $n =~ /(?:_weight|_gamma)$/) |
| { |
| $self->wd_mult->{ $n } = 0; |
| } |
| } |
| if($self->sym) |
| { |
| my $attr = $self->sym->attr_dict(); |
| for my $name (@{ $self->sym->list_arguments() }) |
| { |
| if(exists $attr->{ $name } and exists $attr->{ $name }{ __wd_mult__ }) |
| { |
| $self->wd_mult->{ $name } = $attr->{ $name }{ __wd_mult__ }; |
| } |
| } |
| } |
| $self->wd_mult({ %{ $self->wd_mult }, %{ $args_wd_mult } }); |
| } |
| |
| method _update_count(Index $index) |
| { |
| if(not exists $self->_index_update_count->{ $index }) |
| { |
| $self->_index_update_count->{ $index } = $self->begin_num_update; |
| } |
| $self->_index_update_count->{ $index } += 1; |
| $self->num_update(max($self->_index_update_count->{ $index }, $self->num_update)); |
| } |
| |
| method _get_lr(Index $index) |
| { |
| my $lr; |
| if($self->lr_scheduler) |
| { |
| $lr = $self->lr_scheduler->($self->num_update); |
| } |
| else |
| { |
| $lr = $self->lr; |
| } |
| |
| if(exists $self->param_dict->{ $index }) |
| { |
| $lr *= $self->param_dict->{ $index }->lr_mult; |
| } |
| elsif(exists $self->lr_mult->{ $index }) |
| { |
| $lr *= $self->lr_mult->{ $index }; |
| } |
| elsif(exists $self->idx2name->{ $index }) |
| { |
| $lr *= $self->lr_mult->{ $self->idx2name->{ $index } }//1; |
| } |
| return $lr; |
| } |
| |
| method _get_wd(Index $index) |
| { |
| my $wd = $self->wd; |
| if(exists $self->param_dict->{ $index }) |
| { |
| $wd *= $self->param_dict->{ $index }->wd_mult; |
| } |
| elsif(exists $self->wd_mult->{ $index }) |
| { |
| $wd *= $self->wd_mult->{ $index }; |
| } |
| elsif(exists $self->idx2name->{ $index }) |
| { |
| $wd *= $self->wd_mult->{ $self->idx2name->{ $index } }//1; |
| } |
| return $wd; |
| } |
| |
| =head1 NAME |
| |
| AI::MXNet::SGD - A very simple SGD optimizer with momentum and weight regularization. |
| =cut |
| |
| =head1 DESCRIPTION |
| |
| A very simple SGD optimizer with momentum and weight regularization. |
| |
| If the storage types of weight and grad are both 'row_sparse', and 'lazy_update' is True, |
| **lazy updates** are applied by |
| |
| for row in grad.indices: |
| rescaled_grad[row] = lr * rescale_grad * clip(grad[row], clip_gradient) + wd * weight[row] |
| state[row] = momentum[row] * state[row] + rescaled_grad[row] |
| weight[row] = weight[row] - state[row] |
| |
| The sparse update only updates the momentum for the weights whose row_sparse |
| gradient indices appear in the current batch, rather than updating it for all |
| indices. Compared with the original update, it can provide large |
| improvements in model training throughput for some applications. However, it |
| provides slightly different semantics than the original update, and |
| may lead to different empirical results. |
| |
| Otherwise, **standard updates** are applied by:: |
| |
| rescaled_grad = lr * rescale_grad * clip(grad, clip_gradient) + wd * weight |
| state = momentum * state + rescaled_grad |
| weight = weight - state |
| |
| Parameters |
| ---------- |
| learning_rate : Num, optional |
| learning_rate of SGD |
| |
| momentum : Num, optional |
| momentum value |
| |
| wd : Num, optional |
        L2 regularization coefficient added to all the weights
| |
| rescale_grad : Num, optional |
| rescaling factor of gradient. Normally should be 1/batch_size. |
| |
| clip_gradient : Num, optional |
| clip gradient in range [-clip_gradient, clip_gradient] |
| |
    param_idx2name : hash ref of Int to Str, optional
        A mapping from parameter indices to names, used to give special weight decay
        treatment to parameters whose names end with bias, gamma, or beta.
| |
| multi_precision: Bool, optional |
| Flag to control the internal precision of the optimizer. |
| False results in using the same precision as the weights (default), |
| True makes internal 32-bit copy of the weights and applies gradients |
| in 32-bit precision even if actual weights used in the model have lower precision. |
| Turning this on can improve convergence and accuracy when training with float16. |
| |
    lazy_update: Bool, optional, default true
        Apply lazy updates when the storage types of weight and grad are both 'row_sparse'.
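
    Example
    -------

    A minimal construction sketch; the hyperparameter values are illustrative only:

        use AI::MXNet;
        my $sgd = AI::MXNet::SGD->new(
            learning_rate   => 0.1,
            momentum        => 0.9,
            wd              => 1e-4,
            rescale_grad    => 1/128,   # typically 1/batch_size
            multi_precision => 1,       # keep float32 master weights for float16 params
        );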
| =cut |
| |
| package AI::MXNet::SGD; |
| use Mouse; |
| extends 'AI::MXNet::Optimizer'; |
| |
| has 'kwargs' => (is => "rw", isa => "HashRef[Num]"); |
| has 'momentum' => (is => "rw", isa => "Num", default => 0); |
| has 'multi_precision' => (is => "ro", isa => "Bool", default => 0); |
| has 'lazy_update' => (is => "ro", isa => "Bool", default => 1); |
| |
| sub BUILD |
| { |
| my $self = shift; |
| $self->kwargs({}); |
| if($self->momentum) |
| { |
| $self->kwargs->{momentum} = $self->momentum; |
| } |
| if($self->clip_gradient) |
| { |
| $self->kwargs->{clip_gradient} = $self->clip_gradient; |
| } |
| } |
| |
| method create_state(Index $index, AI::MXNet::NDArray $weight) |
| { |
| my $momentum; |
| my $weight_master_copy; |
| my $stype = $self->lazy_update ? $weight->stype : 'default'; |
| if($self->multi_precision and $weight->dtype eq 'float16') |
| { |
        $weight_master_copy = AI::MXNet::NDArray->array($weight, ctx => $weight->context, dtype => 'float32');
| if($self->momentum != 0) |
| { |
| $momentum = AI::MXNet::NDArray->zeros($weight->shape, stype => $stype, ctx => $weight->context, dtype => 'float32'); |
| } |
| return [$momentum, $weight_master_copy]; |
| } |
| if($weight->dtype eq 'float16' and not $self->multi_precision) |
| { |
| AI::MXNet::Logging->warning( |
| "Accumulating with float16 in optimizer can lead to ". |
| "poor accuracy or slow convergence. ". |
| "Consider using multi_precision=True option of the ". |
| "SGD optimizer" |
| ); |
| } |
| if($self->momentum != 0) |
| { |
| $momentum = AI::MXNet::NDArray->zeros($weight->shape, stype => $stype, ctx => $weight->context, dtype => $weight->dtype); |
| } |
| return $momentum; |
| } |
| |
| method update( |
| Index $index, |
| AI::MXNet::NDArray $weight, |
| AI::MXNet::NDArray $grad, |
| Maybe[AI::MXNet::NDArray|ArrayRef[Maybe[AI::MXNet::NDArray]]] $state |
| ) |
| { |
| $self->_update_count($index); |
| my $lr = $self->_get_lr($index); |
| my $wd = $self->_get_wd($index); |
| my $kwargs = { |
| out => $weight, |
| lr => $lr, |
| wd => $wd, |
| rescale_grad => $self->rescale_grad, |
| %{ $self->kwargs } |
| }; |
| my $use_multi_precision = ref($state) eq 'ARRAY'; |
| if(not $use_multi_precision) |
| { |
| if(defined $state) |
| { |
| AI::MXNet::NDArray->sgd_mom_update( |
| $weight, $grad, $state, $kwargs |
| ); |
| } |
| else |
| { |
| AI::MXNet::NDArray->sgd_update( |
| $weight, $grad, $kwargs |
| ); |
| } |
| } |
| else |
| { |
| if(defined $state->[0]) |
| { |
| AI::MXNet::NDArray->mp_sgd_mom_update( |
| $weight, $grad, $state->[0], $state->[1], $kwargs |
| ); |
| } |
| else |
| { |
| AI::MXNet::NDArray->mp_sgd_update( |
| $weight, $grad, $state->[1], $kwargs |
| ); |
| } |
| } |
| } |
| |
| __PACKAGE__->register; |
| |
| =head1 NAME |
| |
| AI::MXNet::Signum - The Signum optimizer that takes the sign of gradient or momentum. |
| =cut |
| |
| =head1 DESCRIPTION |
| |
| The optimizer updates the weight by: |
| |
| rescaled_grad = rescale_grad * clip(grad, clip_gradient) + wd * weight |
| state = momentum * state + (1-momentum)*rescaled_grad |
| weight = (1 - lr * wd_lh) * weight - lr * sign(state) |
| |
| See the original paper at: https://jeremybernste.in/projects/amazon/signum.pdf |
| |
| This optimizer accepts the following parameters in addition to those accepted |
| by AI::MXNet::Optimizer |
| |
| Parameters |
| ---------- |
| momentum : Num, optional |
| The momentum value. |
| wd_lh : Num, optional |
| The amount of decoupled weight decay regularization, see details in the original paper at: |
| https://arxiv.org/abs/1711.05101 |
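
    Example
    -------

    A minimal construction sketch; the values are illustrative only:

        use AI::MXNet;
        my $signum = AI::MXNet::Signum->new(
            learning_rate => 0.01,
            momentum      => 0.9,
            wd_lh         => 1e-5,   # decoupled weight decay
        );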
| =cut |
| |
| package AI::MXNet::Signum; |
| use Mouse; |
| extends 'AI::MXNet::Optimizer'; |
| |
| has 'momentum' => (is => "rw", isa => "Num", default => 0.9); |
| has 'wd_lh' => (is => "rw", isa => "Num", default => 0); |
| |
| method create_state(Index $index, AI::MXNet::NDArray $weight) |
| { |
| |
| my $momentum; |
| if($self->momentum != 0) |
| { |
| $momentum = AI::MXNet::NDArray->zeros( |
| $weight->shape, |
| ctx => $weight->context, |
| dtype=>$weight->dtype, |
| stype=>$weight->stype |
| ); |
| } |
| return $momentum; |
| } |
| |
| method update( |
| Index $index, |
| AI::MXNet::NDArray $weight, |
| AI::MXNet::NDArray $grad, |
| Maybe[AI::MXNet::NDArray|ArrayRef[Maybe[AI::MXNet::NDArray]]] $state |
| ) |
| { |
| $self->_update_count($index); |
| my $lr = $self->_get_lr($index); |
| my $wd = $self->_get_wd($index); |
| my %kwargs = ( |
| out => $weight, |
| lr => $lr, |
| wd => $wd, |
| rescale_grad => $self->rescale_grad, |
| ); |
| if($self->momentum > 0) |
| { |
| $kwargs{momentum} = $self->momentum; |
| } |
| if($self->clip_gradient) |
| { |
| $kwargs{clip_gradient} = $self->clip_gradient; |
| } |
| if($self->wd_lh) |
| { |
| $kwargs{wd_lh} = $self->wd_lh; |
| } |
| if(defined $state) |
| { |
| AI::MXNet::NDArray->signum_update( |
| $weight, $grad, $state, %kwargs |
| ); |
| } |
| else |
| { |
| AI::MXNet::NDArray->signsgd_update( |
| $weight, $grad, %kwargs |
| ); |
| } |
| } |
| |
| __PACKAGE__->register; |
| |
| =head1 NAME |
| |
| AI::MXNet::FTML - The FTML optimizer. |
| =cut |
| |
| =head1 DESCRIPTION |
| |
| This class implements the optimizer described in |
| *FTML - Follow the Moving Leader in Deep Learning*, |
| available at http://proceedings.mlr.press/v70/zheng17a/zheng17a.pdf. |
| |
| This optimizer accepts the following parameters in addition to those accepted |
| by AI::MXNet::Optimizer |
| |
| Parameters |
| ---------- |
| beta1 : Num, optional |
| 0 < beta1 < 1. Generally close to 0.5. |
| beta2 : Num, optional |
| 0 < beta2 < 1. Generally close to 1. |
| epsilon : Num, optional |
| Small value to avoid division by 0. |
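
    Example
    -------

    A minimal construction sketch; the values are illustrative only:

        use AI::MXNet;
        my $ftml = AI::MXNet::FTML->new(
            learning_rate => 0.0025,
            beta1         => 0.6,
            beta2         => 0.999,
            epsilon       => 1e-8,
        );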
| =cut |
| |
| package AI::MXNet::FTML; |
| use Mouse; |
| extends 'AI::MXNet::Optimizer'; |
| |
| has 'beta1' => (is => "rw", isa => "Num", default => 0.6); |
| has 'beta2' => (is => "rw", isa => "Num", default => 0.999); |
| has 'epsilon' => (is => "rw", isa => "Num", default => 1e-8); |
| |
| method create_state(Index $index, AI::MXNet::NDArray $weight) |
| { |
| return [ |
| AI::MXNet::NDArray->zeros($weight->shape, ctx => $weight->context, dtype=>$weight->dtype), # d_0 |
| AI::MXNet::NDArray->zeros($weight->shape, ctx => $weight->context, dtype=>$weight->dtype), # v_0 |
| AI::MXNet::NDArray->zeros($weight->shape, ctx => $weight->context, dtype=>$weight->dtype), # z_0 |
| ]; |
| } |
| |
| method update( |
| Index $index, |
| AI::MXNet::NDArray $weight, |
| AI::MXNet::NDArray $grad, |
| Maybe[AI::MXNet::NDArray|ArrayRef[Maybe[AI::MXNet::NDArray]]] $state |
| ) |
| { |
| my $lr = $self->_get_lr($index); |
| my $wd = $self->_get_wd($index); |
| my $t = $self->_update_count($index); |
| my %kwargs = ( |
| out => $weight, |
| lr => $lr, |
| wd => $wd, |
| t => $t, |
| beta1 => $self->beta1, |
| beta2 => $self->beta2, |
| epsilon => $self->epsilon, |
| rescale_grad => $self->rescale_grad |
| ); |
| if($self->clip_gradient) |
| { |
| $kwargs{clip_grad} = $self->clip_gradient; |
| } |
| AI::MXNet::NDArray->ftml_update($weight, $grad, @{ $state }, \%kwargs); |
| } |
| |
| __PACKAGE__->register; |
| |
| =head1 NAME |
| |
| AI::MXNet::LBSGD - The Large Batch SGD optimizer with momentum and weight decay. |
| =cut |
| |
| =head1 DESCRIPTION |
| |
| The optimizer updates the weight by:: |
| |
| state = momentum * state + lr * rescale_grad * clip(grad, clip_gradient) + wd * weight |
| weight = weight - state |
| |
| Parameters |
| ---------- |
| momentum : Num, optional |
| The momentum value. |
| multi_precision: Bool, optional |
| Flag to control the internal precision of the optimizer. |
| 0 results in using the same precision as the weights (default), |
| 1 makes internal 32-bit copy of the weights and applies gradients |
        in 32-bit precision even if actual weights used in the model have lower precision.
| Turning this on can improve convergence and accuracy when training with float16. |
    warmup_strategy: Str, optional ('linear', 'power2', 'sqrt', or 'lars'; default: 'linear')
    warmup_epochs: Int, optional, default: 5
    batch_scale: Num, optional, default: 1 (should equal batch_size * num_workers)
    updates_per_epoch: Int, optional, default: 32
        Used for warmup; the default might not reflect the true number of batches per epoch.
    begin_epoch: Int, optional, default: 0
        The starting epoch.
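
    Example
    -------

    A minimal construction sketch; the values are illustrative only:

        use AI::MXNet;
        my $lbsgd = AI::MXNet::LBSGD->new(
            learning_rate     => 0.1,
            momentum          => 0.9,
            warmup_strategy   => 'lars',
            warmup_epochs     => 5,
            batch_scale       => 4,    # batch_size * num_workers scaling
            updates_per_epoch => 32,
        );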
| =cut |
| |
| package AI::MXNet::LBSGD; |
| use Mouse; |
| extends 'AI::MXNet::Optimizer'; |
| |
| has 'momentum' => (is => 'rw', isa => 'Num', default => 0); |
| has 'multi_precision' => (is => 'rw', isa => 'Bool', default => 0); |
has 'warmup_strategy'    => (is => 'rw', isa => 'Str', default => 'linear');
| has 'warmup_epochs' => (is => 'rw', isa => 'Int', default => 5); |
| has 'batch_scale' => (is => 'rw', isa => 'Num', default => 1); |
| has 'updates_per_epoch' => (is => 'rw', isa => 'Int', default => 32); |
| has 'begin_epoch' => (is => 'rw', isa => 'Int', default => 0); |
| has 'num_epochs' => (is => 'rw', isa => 'Int', default => 60); |
| has 'beta2' => (is => 'rw', isa => 'Num', default => 0.999); |
| has 'epsilon' => (is => 'rw', isa => 'Num', default => 1e-8); |
has 'init_updates'       => (is => 'rw', init_arg => undef);
has [qw/lbmult
        cumgrads
        adaptive
        admult/]         => (is => 'rw', init_arg => undef);
| |
| sub BUILD |
| { |
| my $self = shift; |
| AI::MXNet::Logging->info('Running Large-Batch SGD Algorithm'); |
| AI::MXNet::Logging->info( |
| '(Batch_scale=%f, warmup_epochs=%d, warmup_strategy=%s, updates_per_epoch=%d)', |
| map { $self->$_ } qw/batch_scale warmup_epochs warmup_strategy updates_per_epoch/ |
| ); |
| $self->init_updates($self->begin_epoch * $self->updates_per_epoch); |
| $self->lbmult(1); |
| $self->cumgrads({}); |
| $self->adaptive(0); |
| $self->admult(1); |
| } |
| |
| method create_state(Index $index, AI::MXNet::NDArray $weight) |
| { |
| my $momentum; |
| my $weight_master_copy; |
| if($self->multi_precision and $weight->dtype eq 'float16') |
| { |
| $weight_master_copy = AI::MXNet::NDArray->array($weight, ctx=>$weight->context, dtype=>'float32'); |
| if($self->momentum != 0) |
| { |
| $momentum = AI::MXNet::NDArray->zeros( |
| $weight->shape, ctx => $weight->context, dtype => 'float32', |
| stype => $weight->stype |
| ); |
| } |
| return [$momentum, $weight_master_copy]; |
| } |
| if($weight->dtype eq 'float16' and not $self->multi_precision) |
| { |
| AI::MXNet::Logging->warning( |
| "Accumulating with float16 in optimizer can lead to " |
| ."poor accuracy or slow convergence. " |
| ."Consider using multi_precision=True option of the " |
| ."LBSGD optimizer" |
| ); |
| } |
| if($self->momentum != 0) |
| { |
| $momentum = AI::MXNet::NDArray->zeros( |
| $weight->shape, ctx => $weight->context, dtype => $weight->dtype, |
| stype => $weight->stype |
| ); |
| } |
| return $momentum; |
| } |
| |
| method _get_lbmult($nup) |
| { |
| my $nwup = $self->warmup_epochs * $self->updates_per_epoch; |
| my $strategy = $self->warmup_strategy; |
| my $maxmult = $self->batch_scale; |
| my $mult; |
| if($nup >= $nwup) |
| { |
| $mult = $maxmult; |
| } |
| elsif($nwup <= 1) |
| { |
| $mult = 1; |
| } |
| else |
| { |
| if ($strategy eq 'linear') |
| { |
| $mult = 1 + ($maxmult - 1) * $nup / $nwup; |
| } |
| elsif($strategy eq 'power2') |
| { |
| $mult = 1 + ($maxmult-1) * ($nup*$nup)/($nwup*$nwup); |
| } |
| elsif($strategy eq 'sqrt') |
| { |
| $mult = 1 + ($maxmult - 1) * sqrt($nup / $nwup); |
| } |
| else |
| { |
| $mult = 1; |
| } |
| } |
| return $mult; |
| } |
| |
| |
| method _get_lars($weight, $g, $wd) |
| { |
| my $weight2 = $self->_l2norm($weight); |
| my $grad2 = $self->_l2norm($g); |
| my $lars = sqrt($weight2 / ($grad2 + $wd * $weight2 + 1e-18)); |
| if($lars < 0.01) |
| { |
| $lars = 0.01; |
| } |
| elsif($lars > 100) |
| { |
| $lars = 100; |
| } |
| return $lars; |
| } |
| |
| method _l2norm($v) |
| { |
| my $norm = AI::MXNet::NDArray->multiply($v, $v)->aspdl->sum; |
| return $norm; |
| } |
| |
| method _reset_cum_gradient($index) |
| { |
| $self->cumgrads->{$index}{cum_grad} = 0; |
| } |
| |
| method _get_cum_gradient($index) |
| { |
| if(exists $self->cumgrads->{$index}) |
| { |
| return $self->cumgrads->{$index}; |
| } |
| else |
| { |
| return {} |
| } |
| } |
| |
| method _put_cum_gradient($index, $cgrad) |
| { |
| $self->cumgrads->{$index} = $cgrad; |
| } |
| |
| method _cumulate_gradient($grad, $index) |
| { |
| my $cgrad = $self->_get_cum_gradient($index); |
| my ($num_cums, $cum_grad); |
| if(%{ $cgrad }) |
| { |
        $num_cums = $cgrad->{num_cums};
| if($num_cums > 0) |
| { |
| $cum_grad = $cgrad->{cum_grad} + $grad; |
| $num_cums += 1; |
| } |
| else |
| { |
| $cum_grad = $grad; |
| $num_cums = $self->init_updates + 1; |
| } |
| } |
| else |
| { |
| $cum_grad = $grad; |
| $num_cums = $self->init_updates + 1; |
| } |
| $cgrad = {cum_grad => $cum_grad, num_cums => $num_cums}; |
| $self->_put_cum_gradient($index, $cgrad); |
| return $cgrad; |
| } |
| |
| |
| |
| method update( |
| Index $index, |
| AI::MXNet::NDArray $weight, |
| AI::MXNet::NDArray $grad, |
| Maybe[AI::MXNet::NDArray|ArrayRef[Maybe[AI::MXNet::NDArray]]] $state |
| ) |
| { |
| my $lr = $self->_get_lr($index); |
| my $wd = $self->_get_wd($index); |
| my $t = $self->_update_count($index); |
| my $cgrad = $self->_cumulate_gradient($grad, $index); |
| if(($cgrad->{num_cums} % $self->batch_scale) == 0) |
| { |
| my $lbmult; |
| $grad = $cgrad->{cum_grad} / $self->batch_scale; |
| if($self->warmup_strategy eq 'lars') |
| { |
| $lbmult = $self->_get_lars($weight, $grad, $wd); |
| } |
| else |
| { |
| $lbmult = $self->_get_lbmult($cgrad->{num_cums}); |
| } |
| $lr = $lr * $lbmult; |
| my %kwargs = ( |
| out => $weight, |
| lr => $lr, |
| wd => $wd, |
| rescale_grad => $self->rescale_grad |
| ); |
| if($self->clip_gradient) |
| { |
| $kwargs{clip_gradient} = $self->clip_gradient; |
| } |
| if($self->momentum > 0) |
| { |
| $kwargs{momentum} = $self->momentum; |
| } |
| my $use_multi_precision = ref($state) eq 'ARRAY'; |
| if(not $use_multi_precision) |
| { |
| if(defined $state) |
| { |
| AI::MXNet::NDArray->sgd_mom_update($weight, $grad, $state, %kwargs); |
| } |
| else |
| { |
| AI::MXNet::NDArray->sgd_update($weight, $grad, %kwargs); |
| } |
| } |
| else |
| { |
| if(defined $state->[0]) |
| { |
| AI::MXNet::NDArray->mp_sgd_mom_update($weight, $grad, @{ $state }, %kwargs); |
| } |
| else |
| { |
| AI::MXNet::NDArray->mp_sgd_update($weight, $grad, $state->[1], %kwargs); |
| } |
| } |
| $self->_reset_cum_gradient($index); |
| } |
| else |
| { |
| AI::MXNet::NDArray->sgd_update($weight, $grad, out => $weight, lr => 0, wd => $wd); |
| } |
| } |
| |
| __PACKAGE__->register; |
| |
| package AI::MXNet::DCASGD; |
| use Mouse; |
| use AI::MXNet::Base; |
| extends 'AI::MXNet::Optimizer'; |
| |
| =head1 NAME |
| |
| AI::MXNet::DCASGD - DCASGD optimizer with momentum and weight regularization. |
| =cut |
| |
| =head1 DESCRIPTION |
| |
| DCASGD optimizer with momentum and weight regularization. |
| |
| Implements paper "Asynchronous Stochastic Gradient Descent with |
| Delay Compensation for Distributed Deep Learning" |
| |
| Parameters |
| ---------- |
| learning_rate : Num, optional |
| learning_rate of SGD |
| |
| momentum : Num, optional |
| momentum value |
| |
    lamda : Num, optional
        scale of the delay compensation term
| |
| wd : Num, optional |
        L2 regularization coefficient added to all the weights
| |
| rescale_grad : Num, optional |
| rescaling factor of gradient. Normally should be 1/batch_size. |
| |
| clip_gradient : Num, optional |
| clip gradient in range [-clip_gradient, clip_gradient] |
| |
    param_idx2name : hash ref of Int to Str, optional
        A mapping from parameter indices to names, used to give special weight decay
        treatment to parameters whose names end with bias, gamma, or beta.
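
    Example
    -------

    A minimal construction sketch; the values are illustrative only:

        use AI::MXNet;
        my $dcasgd = AI::MXNet::DCASGD->new(
            learning_rate => 0.1,
            momentum      => 0.9,
            lamda         => 0.04,   # scale of the delay compensation term
        );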
| =cut |
| has 'momentum' => (is => 'ro', isa => 'Num', default => 0); |
| has 'lamda' => (is => 'ro', isa => 'Num', default => 0.04); |
| has 'weight_previous' => (is => 'rw', init_arg => undef); |
| |
| sub BUILD |
| { |
| my $self = shift; |
| $self->weight_previous({}); |
| } |
| |
| method create_state(Index $index, AI::MXNet::NDArray $weight) |
| { |
| return [ |
| $self->momentum ? AI::MXNet::NDArray->zeros( |
| $weight->shape, ctx => $weight->context, dtype => $weight->dtype |
| ) : undef, |
| $weight->copy |
| ]; |
| } |
| |
| method update( |
| Index $index, |
| AI::MXNet::NDArray $weight, |
| AI::MXNet::NDArray $grad, |
| Maybe[AI::MXNet::NDArray|ArrayRef[Maybe[AI::MXNet::NDArray]]] $state |
| ) |
| { |
| my $lr = $self->_get_lr($index); |
| my $wd = $self->_get_wd($index); |
| $self->_update_count($index); |
| $grad *= $self->rescale_grad; |
| if($self->clip_gradient) |
| { |
| $grad = AI::MXNet::NDArray->clip( |
| $grad, |
| -$self->clip_gradient, |
| $self->clip_gradient |
| ); |
| } |
| my ($mom, $weight_previous) = @{ $state }; |
| if(defined $mom) |
| { |
| $mom *= $self->momentum; |
| $mom += -$lr * ( |
| $grad + $wd * $weight |
| + |
| $self->lamda * $grad * $grad * ($weight - $weight_previous) |
| ); |
| } |
| else |
| { |
| assert($self->momentum == 0); |
| $mom = -$lr * ( |
| $grad + $wd * $weight |
| + |
| $self->lamda * $grad * $grad * ($weight - $weight_previous) |
| ); |
| } |
| $weight_previous .= $weight; |
| $weight += $mom; |
| } |
| |
| __PACKAGE__->register; |
| |
| =head1 NAME |
| |
    AI::MXNet::NAG - SGD with Nesterov momentum (NAG).
| =cut |
| |
| =head1 DESCRIPTION |
| |
| It is implemented according to |
| https://github.com/torch/optim/blob/master/sgd.lua |
| =cut |
| |
| package AI::MXNet::NAG; |
| use Mouse; |
| extends 'AI::MXNet::SGD'; |
| |
| method create_state(Index $index, AI::MXNet::NDArray $weight) |
| { |
| my $momentum; |
| my $weight_master_copy; |
| my $do_multi_precision = ($self->multi_precision and $weight->dtype eq 'float16'); |
| if($do_multi_precision) |
| { |
| if($self->momentum != 0) |
| { |
| $momentum = AI::MXNet::NDArray->zeros($weight->shape, ctx => $weight->context, dtype=>'float32'); |
| } |
| $weight_master_copy = AI::MXNet::NDArray->array($weight, ctx=>$weight->context, dtype=>'float32'); |
| return [$weight_master_copy, $momentum]; |
| } |
| else |
| { |
| if($self->momentum != 0) |
| { |
| $momentum = AI::MXNet::NDArray->zeros($weight->shape, ctx => $weight->context, dtype=>$weight->dtype); |
| } |
| return $momentum; |
| } |
| } |
| |
| method update($index, $weight, $grad, $state) |
| { |
| my $lr = $self->_get_lr($index); |
| my $wd = $self->_get_wd($index); |
| $self->_update_count($index); |
    my $use_multi_precision = (defined $state and not Scalar::Util::blessed($state) and ref($state) eq 'ARRAY');
| if(not $use_multi_precision) |
| { |
| $grad *= $self->rescale_grad; |
| if(defined $self->clip_gradient) |
| { |
| $grad = AI::MXNet::NDArray->clip($grad, -$self->clip_gradient, $self->clip_gradient); |
| } |
| if($self->momentum == 0) |
| { |
| $weight += -$lr * ($grad + $wd * $weight); |
| } |
| else |
| { |
| my $mom = $state; |
| $mom *= $self->momentum; |
| $grad += $wd * $weight; |
| $mom += $grad; |
| $grad += $self->momentum * $mom; |
| $weight += -$lr * $grad; |
| } |
| } |
| else |
| { |
| my $grad32 = AI::MXNet::NDArray->array($grad, ctx=>$grad->context, dtype=>'float32'); |
| $grad32 *= $self->rescale_grad; |
| if(defined $self->clip_gradient) |
| { |
| $grad32 = AI::MXNet::NDArray->clip($grad32, -$self->clip_gradient, $self->clip_gradient); |
| } |
| my $mom = $state->[1]; |
| my $weight32 = $state->[0]; |
| if($self->momentum == 0) |
| { |
| $weight32 += -$lr * ($grad32 + $wd * $weight32); |
| } |
| else |
| { |
| $mom *= $self->momentum; |
| $grad32 += $wd * $weight32; |
| $mom += $grad32; |
| $grad32 += $self->momentum * $mom; |
| $weight32 += -$lr * $grad32; |
| } |
| my $tmp = $weight32->astype($weight->dtype); |
| $tmp->copyto($weight); |
| } |
| } |
| |
| __PACKAGE__->register; |
| |
| =head1 NAME |
| |
| AI::MXNet::SGLD - Stochastic Gradient Riemannian Langevin Dynamics. |
| =cut |
| |
| =head1 DESCRIPTION |
| |
| Stochastic Gradient Riemannian Langevin Dynamics. |
| |
| This class implements the optimizer described in the paper *Stochastic Gradient |
| Riemannian Langevin Dynamics on the Probability Simplex*, available at |
| https://papers.nips.cc/paper/4883-stochastic-gradient-riemannian-langevin-dynamics-on-the-probability-simplex.pdf. |
| |
| Parameters |
| ---------- |
| learning_rate : Num, optional |
| learning_rate of SGD |
| |
| wd : Num, optional |
        L2 regularization coefficient added to all the weights
| |
| rescale_grad : Num, optional |
| rescaling factor of gradient. Normally should be 1/batch_size. |
| |
| clip_gradient : Num, optional |
| clip gradient in range [-clip_gradient, clip_gradient] |
| =cut |
| |
| package AI::MXNet::SGLD; |
| use Mouse; |
| |
| extends 'AI::MXNet::Optimizer'; |
| |
| method create_state(Index $index, AI::MXNet::NDArray $weight) |
| { |
| return undef; |
| } |
| |
| method update( |
| Index $index, |
| AI::MXNet::NDArray $weight, |
| AI::MXNet::NDArray $grad, |
| AI::MXNet::NDArray|Undef $state |
| ) |
| { |
| my $lr = $self->_get_lr($index); |
| my $wd = $self->_get_wd($index); |
| $self->_update_count($index); |
| $grad *= $self->rescale_grad; |
| if($self->clip_gradient) |
| { |
| $grad = AI::MXNet::NDArray->clip( |
| $grad, |
| -$self->clip_gradient, |
| $self->clip_gradient |
| ); |
| } |
| $weight += - $lr/2 * ($grad + $wd * $weight) |
| + |
| AI::MXNet::Random->normal( |
| 0, sqrt($lr), |
| shape => $weight->shape, |
| ctx => $weight->context, |
| dtype => $weight->dtype |
| ); |
| } |
| |
| __PACKAGE__->register; |
| |
| =head1 NAME |
| |
| AI::MXNet::Adam - Adam optimizer as described in [King2014]_. |
| =cut |
| |
| =head1 DESCRIPTION |
| |
| Adam optimizer as described in [King2014]_. |
| |
| .. [King2014] Diederik Kingma, Jimmy Ba, |
| *Adam: A Method for Stochastic Optimization*, |
| http://arxiv.org/abs/1412.6980 |
| |
| Parameters |
| ---------- |
| learning_rate : Num, optional |
| Step size. |
| Default value is set to 0.001. |
| beta1 : Num, optional |
| Exponential decay rate for the first moment estimates. |
| Default value is set to 0.9. |
| beta2 : Num, optional |
| Exponential decay rate for the second moment estimates. |
| Default value is set to 0.999. |
| epsilon : Num, optional |
| Default value is set to 1e-8. |
| |
    wd : Num, optional
        L2 regularization coefficient added to all the weights
| rescale_grad : Num, optional |
| rescaling factor of gradient. Normally should be 1/batch_size. |
| |
| clip_gradient : Num, optional |
| clip gradient in range [-clip_gradient, clip_gradient] |
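
    Example
    -------

    A minimal sketch that applies a single Adam step through an updater;
    the shapes and values are illustrative only:

        use AI::MXNet;
        my $adam    = AI::MXNet::Adam->new(learning_rate => 0.001);
        my $updater = AI::MXNet::Optimizer->get_updater($adam);
        my $weight  = AI::MXNet::NDArray->ones([3]);
        my $grad    = AI::MXNet::NDArray->ones([3]) * 0.1;
        $updater->(0, $grad, $weight);   # updates $weight in place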
| =cut |
| package AI::MXNet::Adam; |
| use Mouse; |
| |
| extends 'AI::MXNet::Optimizer'; |
| |
| has 'kwargs' => (is => "rw", isa => "HashRef[Num]"); |
| has '+learning_rate' => (default => 0.001); |
| has 'beta1' => (is => "rw", isa => "Num", default => 0.9); |
| has 'beta2' => (is => "rw", isa => "Num", default => 0.999); |
| has 'epsilon' => (is => "rw", isa => "Num", default => 1e-8); |
| has 'lazy_update' => (is => 'rw', isa => 'Bool', default => 1); |
| |
| sub BUILD |
| { |
| my $self = shift; |
| $self->kwargs({ |
| beta1 => $self->beta1, |
| beta2 => $self->beta2, |
| epsilon => $self->epsilon |
| }); |
| if($self->clip_gradient) |
| { |
| $self->kwargs->{clip_gradient} = $self->clip_gradient; |
| } |
| } |
| |
| method create_state(Index $index, AI::MXNet::NDArray $weight) |
| { |
| my $stype = $self->lazy_update ? $weight->stype : 'default'; |
| return [AI::MXNet::NDArray->zeros( |
| $weight->shape, |
| ctx => $weight->context, |
| dtype => $weight->dtype, |
| stype => $stype |
| ), # mean |
| AI::MXNet::NDArray->zeros( |
| $weight->shape, |
| ctx => $weight->context, |
| dtype => $weight->dtype, |
| stype => $stype |
| ) # variance |
| ]; |
| } |
| |
| method update( |
| Index $index, |
| AI::MXNet::NDArray $weight, |
| AI::MXNet::NDArray $grad, |
| ArrayRef[AI::MXNet::NDArray] $state |
| ) |
| { |
| my $lr = $self->_get_lr($index); |
| my $wd = $self->_get_wd($index); |
| $self->_update_count($index); |
| my $t = $self->_index_update_count->{$index}; |
| my $coef1 = 1 - $self->beta1**$t; |
| my $coef2 = 1 - $self->beta2**$t; |
| $lr *= sqrt($coef2)/$coef1; |
| my ($mean, $var) = @{ $state }; |
| AI::MXNet::NDArray->adam_update( |
| $weight, $grad, $mean, $var, |
| { |
| out => $weight, |
| lr => $lr, |
| wd => $wd, |
| rescale_grad => $self->rescale_grad, |
| %{ $self->kwargs } |
| } |
| ); |
| } |
| |
| __PACKAGE__->register; |
| |
| =head1 NAME |
| |
| AI::MXNet::AdaGrad - AdaGrad optimizer of Duchi et al., 2011 |
| =cut |
| |
| =head1 DESCRIPTION |
| |
| AdaGrad optimizer of Duchi et al., 2011, |
| |
| This code follows the version in http://arxiv.org/pdf/1212.5701v1.pdf Eq(5) |
| by Matthew D. Zeiler, 2012. AdaGrad will help the network to converge faster |
| in some cases. |
| |
| Parameters |
| ---------- |
| learning_rate : Num, optional |
| Step size. |
| Default value is set to 0.05. |
| |
| wd : Num, optional |
        L2 regularization coefficient added to all the weights
| |
| rescale_grad : Num, optional |
| rescaling factor of gradient. Normally should be 1/batch_size. |
| |
| eps: Num, optional |
| A small float number to make the updating processing stable |
| Default value is set to 1e-7. |
| |
| clip_gradient : Num, optional |
| clip gradient in range [-clip_gradient, clip_gradient] |
| =cut |
| package AI::MXNet::AdaGrad; |
| use Mouse; |
| |
| extends 'AI::MXNet::Optimizer'; |
| |
| has 'eps' => (is => "rw", isa => "Num", default => 1e-7); |
| |
| method create_state(Index $index, AI::MXNet::NDArray $weight) |
| { |
| return AI::MXNet::NDArray->zeros( |
| $weight->shape, |
| ctx => $weight->context, |
| stype => $weight->stype |
| ); # history |
| } |
| |
| method update( |
| Index $index, |
| AI::MXNet::NDArray $weight, |
| AI::MXNet::NDArray $grad, |
| AI::MXNet::NDArray $state |
| ) |
| { |
| my $lr = $self->_get_lr($index); |
| my $wd = $self->_get_wd($index); |
| $self->_update_count($index); |
| my $is_sparse = $grad->stype eq 'row_sparse' ? 1 : 0; |
| my $history = $state; |
| if($is_sparse) |
| { |
| my %kwargs = ( |
| epsilon => $self->eps, |
| rescale_grad => $self->rescale_grad |
| ); |
| if($self->clip_gradient) |
| { |
| $kwargs{clip_gradient} = $self->clip_gradient; |
| } |
| AI::MXNet::NDArray::Sparse->adagrad_update($weight, $grad, $history, { out=>$weight, lr=>$lr, wd=>$wd, %kwargs }); |
| } |
| else |
| { |
| $grad *= $self->rescale_grad; |
| if(defined $self->clip_gradient) |
| { |
| $grad = AI::MXNet::NDArray->clip($grad, -$self->clip_gradient, $self->clip_gradient); |
| } |
| $history += $grad->square; |
| my $div = $grad / ($history + $self->eps)->sqrt; |
| $weight += ($div + $weight * $wd) * -$lr; |
| } |
| } |
| |
| __PACKAGE__->register; |
| |
| =head1 NAME |
| |
| AI::MXNet::RMSProp - RMSProp optimizer of Tieleman & Hinton, 2012. |
| =cut |
| |
| =head1 DESCRIPTION |
| |
| RMSProp optimizer of Tieleman & Hinton, 2012, |
| |
| For centered=False, the code follows the version in |
| http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf by |
| Tieleman & Hinton, 2012 |
| |
| For centered=True, the code follows the version in |
| http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013. |
| |
| Parameters |
| ---------- |
| learning_rate : Num, optional |
| Step size. |
| Default value is set to 0.001. |
| gamma1: Num, optional |
| decay factor of moving average for gradient^2. |
| Default value is set to 0.9. |
| gamma2: Num, optional |
| "momentum" factor. |
        Default value is set to 0.9.
| Only used if centered=True |
| epsilon : Num, optional |
| Default value is set to 1e-8. |
| centered : Bool, optional |
        Use Graves' (centered) or Tieleman & Hinton's version of RMSProp
| wd : Num, optional |
        L2 regularization coefficient added to all the weights
| rescale_grad : Num, optional |
| rescaling factor of gradient. |
| clip_gradient : Num, optional |
| clip gradient in range [-clip_gradient, clip_gradient] |
| clip_weights : Num, optional |
| clip weights in range [-clip_weights, clip_weights] |
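
    Example
    -------

    A minimal construction sketch of the centered (Graves) variant;
    the values are illustrative only:

        use AI::MXNet;
        my $rmsprop = AI::MXNet::RMSProp->new(
            learning_rate => 0.001,
            gamma1        => 0.9,
            centered      => 1,     # also uses gamma2
            gamma2        => 0.9,
        );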
| =cut |
| |
| package AI::MXNet::RMSProp; |
| use Mouse; |
| |
| extends 'AI::MXNet::Optimizer'; |
| |
| has '+learning_rate' => (default => 0.001); |
| has 'gamma1' => (is => "ro", isa => "Num", default => 0.9); |
| has 'gamma2' => (is => "ro", isa => "Num", default => 0.9); |
| has 'epsilon' => (is => "ro", isa => "Num", default => 1e-8); |
| has 'centered' => (is => "ro", isa => "Bool", default => 0); |
| has 'clip_weights' => (is => "ro", isa => "Num"); |
| has 'kwargs' => (is => "rw", init_arg => undef); |
| |
| sub BUILD |
| { |
| my $self = shift; |
| $self->kwargs({ |
| gamma1 => $self->gamma1, |
| epsilon => $self->epsilon |
| }); |
| if($self->centered) |
| { |
| $self->kwargs->{gamma2} = $self->gamma2; |
| } |
| if($self->clip_gradient) |
| { |
| $self->kwargs->{clip_gradient} = $self->clip_gradient; |
| } |
| if($self->clip_weights) |
| { |
| $self->kwargs->{clip_weights} = $self->clip_weights; |
| } |
| } |
| |
| # For centered=False: n |
| # For centered=True: n, g, delta |
| method create_state(Index $index, AI::MXNet::NDArray $weight) |
| { |
| return [ |
| $self->centered |
| ? ( |
| AI::MXNet::NDArray->zeros( |
| $weight->shape, |
| ctx => $weight->context, |
| stype => $weight->stype |
| ), # n |
| AI::MXNet::NDArray->zeros( |
| $weight->shape, |
| ctx => $weight->context, |
| stype => $weight->stype |
| ), # g |
| AI::MXNet::NDArray->zeros( |
| $weight->shape, |
| ctx => $weight->context, |
| stype => $weight->stype |
| ) |
| ) # delta |
| : ( |
| AI::MXNet::NDArray->zeros( |
| $weight->shape, |
| ctx => $weight->context, |
| stype => $weight->stype |
| ), # n |
| ) |
| ]; |
| } |
| |
| method update( |
| Index $index, |
| AI::MXNet::NDArray $weight, |
| AI::MXNet::NDArray $grad, |
| ArrayRef[AI::MXNet::NDArray] $state |
| ) |
| { |
| my $lr = $self->_get_lr($index); |
| my $wd = $self->_get_wd($index); |
| $self->_update_count($index); |
| my ($n, $g, $delta) = @{ $state }; |
| if($self->centered) |
| { |
| AI::MXNet::NDArray->rmspropalex_update( |
| $weight, $grad, $n, $g, $delta, |
| { |
| out => $weight, |
| lr => $lr, |
| wd => $wd, |
| rescale_grad => $self->rescale_grad, |
| %{ $self->kwargs } |
| } |
| ); |
| } |
| else |
| { |
| AI::MXNet::NDArray->rmsprop_update( |
| $weight, $grad, $n, |
| { |
| out => $weight, |
| lr => $lr, |
| wd => $wd, |
| rescale_grad => $self->rescale_grad, |
| %{ $self->kwargs } |
| } |
| ); |
| } |
| } |
| |
| __PACKAGE__->register; |
| |
| =head1 NAME |
| |
| AI::MXNet::AdaDelta - AdaDelta optimizer. |
| =cut |
| |
| =head1 DESCRIPTION |
| |
| AdaDelta optimizer as described in |
| Zeiler, M. D. (2012). |
| *ADADELTA: An adaptive learning rate method.* |
| |
| http://arxiv.org/abs/1212.5701 |
| |
| Parameters |
| ---------- |
| rho: Num |
| Decay rate for both squared gradients and delta x |
| epsilon : Num |
        The constant as described in the paper
| wd : Num |
        L2 regularization coefficient added to all the weights
| rescale_grad : Num, optional |
| rescaling factor of gradient. Normally should be 1/batch_size. |
| clip_gradient : Num, optional |
| clip gradient in range [-clip_gradient, clip_gradient] |
| =cut |
| package AI::MXNet::AdaDelta; |
| use Mouse; |
| |
| extends 'AI::MXNet::Optimizer'; |
| |
| has 'rho' => (is => "rw", isa => "Num", default => 0.9); |
| has 'epsilon' => (is => "rw", isa => "Num", default => 1e-5); |
| |
| method create_state(Index $index, AI::MXNet::NDArray $weight) |
| { |
| return [ |
| AI::MXNet::NDArray->zeros( |
| $weight->shape, |
| ctx => $weight->context |
| ), # accumulated g |
| AI::MXNet::NDArray->zeros( |
| $weight->shape, |
| ctx => $weight->context |
| ) # accumulated delta |
| ]; |
| } |
| |
| method update( |
| Index $index, |
| AI::MXNet::NDArray $weight, |
| AI::MXNet::NDArray $grad, |
| ArrayRef[AI::MXNet::NDArray] $state |
| ) |
| { |
| my $wd = $self->_get_wd($index); |
| $self->_update_count($index); |
| $grad *= $self->rescale_grad; |
| if($self->clip_gradient) |
| { |
| $grad = AI::MXNet::NDArray->clip( |
| $grad, |
| -$self->clip_gradient, |
| $self->clip_gradient |
| ); |
| } |
| my ($acc_g, $acc_delta) = @{ $state }; |
| $acc_g .= $self->rho * $acc_g + (1 - $self->rho) * $grad * $grad; |
| my $current_delta = ($acc_delta + $self->epsilon)->sqrt |
| / |
| ($acc_g + $self->epsilon)->sqrt |
| * |
| $grad; |
| $acc_delta .= $self->rho * $acc_delta + (1 - $self->rho) * $current_delta * $current_delta; |
| $weight -= $current_delta + $wd * $weight; |
| } |
| |
| __PACKAGE__->register; |
| |
| # For test use |
| package AI::MXNet::Test; |
| use Mouse; |
| |
| extends 'AI::MXNet::Optimizer'; |
| |
| # Create a state to duplicate weight |
| method create_state(Index $index, AI::MXNet::NDArray $weight) |
| { |
| return AI::MXNet::NDArray->zeros( |
| $weight->shape, |
| ctx => $weight->context |
| ); |
| } |
| |
| # performs w += rescale_grad * grad |
| method update( |
| Index $index, |
| AI::MXNet::NDArray $weight, |
| AI::MXNet::NDArray $grad, |
| AI::MXNet::NDArray $state |
| ) |
| { |
| $weight += $grad * $self->rescale_grad; |
| $state .= $weight; |
| } |
| |
| __PACKAGE__->register; |
| |
| package AI::MXNet::Ftrl; |
| |
| |
| =head1 NAME |
| |
| AI::MXNet::Ftrl |
| =cut |
| |
| =head1 DESCRIPTION |
| |
| Referenced from *Ad Click Prediction: a View from the Trenches*, available at |
| http://dl.acm.org/citation.cfm?id=2488200. |
| |
| The optimizer updates the weight by: |
| |
| rescaled_grad = clip(grad * rescale_grad, clip_gradient) |
| z += rescaled_grad - (sqrt(n + rescaled_grad**2) - sqrt(n)) * weight / learning_rate |
| n += rescaled_grad**2 |
| w = (sign(z) * lamda1 - z) / ((beta + sqrt(n)) / learning_rate + wd) * (abs(z) > lamda1) |
| |
| If the storage types of weight, state and grad are all row_sparse, |
| **sparse updates** are applied by:: |
| |
| for row in grad.indices: |
| rescaled_grad[row] = clip(grad[row] * rescale_grad, clip_gradient) |
| z[row] += rescaled_grad[row] - (sqrt(n[row] + rescaled_grad[row]**2) - sqrt(n[row])) * weight[row] / learning_rate |
| n[row] += rescaled_grad[row]**2 |
| w[row] = (sign(z[row]) * lamda1 - z[row]) / ((beta + sqrt(n[row])) / learning_rate + wd) * (abs(z[row]) > lamda1) |
| |
| The sparse update only updates the z and n for the weights whose row_sparse |
| gradient indices appear in the current batch, rather than updating it for all |
| indices. Compared with the original update, it can provide large |
| improvements in model training throughput for some applications. However, it |
| provides slightly different semantics than the original update, and |
| may lead to different empirical results. |
| |
| This optimizer accepts the following parameters in addition to those accepted |
| by AI::MXNet::Optimizer |
| |
| Parameters |
| ---------- |
| lamda1 : Num, optional |
| L1 regularization coefficient. |
| learning_rate : Num, optional |
| The initial learning rate. |
| beta : Num, optional |
| Per-coordinate learning rate correlation parameter. |
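
    Example
    -------

    A minimal construction sketch; the values are illustrative only:

        use AI::MXNet;
        my $ftrl = AI::MXNet::Ftrl->new(
            learning_rate => 0.1,
            lamda1        => 0.01,   # L1 regularization strength
            beta          => 1,
        );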
| =cut |
| |
| use Mouse; |
| extends 'AI::MXNet::Optimizer'; |
| has '+learning_rate' => (default => 0.1); |
| has 'beta' => (is => "ro", isa => "Num", default => 1); |
| has 'lamda1' => (is => "ro", isa => "Num", default => 0.01); |
| |
| method create_state(Index $index, AI::MXNet::NDArray $weight) |
| { |
| return [ |
| AI::MXNet::NDArray->zeros( |
| $weight->shape, |
| ctx => $weight->context, |
| stype => $weight->stype |
| ), # z |
| AI::MXNet::NDArray->zeros( |
| $weight->shape, |
| ctx => $weight->context, |
| stype => $weight->stype |
| ) # n |
| ]; |
| } |
| |
| method update( |
| Index $index, |
| AI::MXNet::NDArray $weight, |
| AI::MXNet::NDArray $grad, |
| ArrayRef[AI::MXNet::NDArray] $state |
| ) |
| { |
| $self->_update_count($index); |
| my $wd = $self->_get_wd($index); |
| my $lr = $self->_get_lr($index); |
| my %kwargs = (lamda1 => $self->lamda1, beta => $self->beta, rescale_grad => $self->rescale_grad); |
| if($self->clip_gradient) |
| { |
| $kwargs{clip_gradient} = $self->clip_gradient; |
| } |
    # z and n accumulators for the FTRL update
| my ($z, $n) = @{ $state }; |
| AI::MXNet::NDArray->ftrl_update( |
| $weight, $grad, $z, $n, |
| { lr => $lr, wd => $wd, %kwargs, out => $weight } |
| ); |
| } |
| |
| __PACKAGE__->register; |
| |
| package AI::MXNet::Adamax; |
| |
| =head1 NAME |
| |
| AI::MXNet::Adamax |
| =cut |
| |
| =head1 DESCRIPTION |
| |
| It is a variant of Adam based on the infinity norm |
| available at http://arxiv.org/abs/1412.6980 Section 7. |
| |
| This optimizer accepts the following parameters in addition to those accepted |
    by AI::MXNet::Optimizer.
| |
| Parameters |
| ---------- |
| beta1 : Num, optional |
| Exponential decay rate for the first moment estimates. |
| beta2 : Num, optional |
| Exponential decay rate for the second moment estimates. |
| =cut |
| |
| use Mouse; |
| extends 'AI::MXNet::Optimizer'; |
| has '+learning_rate' => (default => 0.002); |
| has 'beta1' => (is => "ro", isa => "Num", default => 0.9); |
| has 'beta2' => (is => "ro", isa => "Num", default => 0.999); |
| |
| method create_state(Index $index, AI::MXNet::NDArray $weight) |
| { |
| return [ |
| AI::MXNet::NDArray->zeros( |
| $weight->shape, |
| ctx => $weight->context, |
| dtype => $weight->dtype |
| ), # mean |
| AI::MXNet::NDArray->zeros( |
| $weight->shape, |
| ctx => $weight->context, |
| dtype => $weight->dtype |
| ) # variance |
| ]; |
| } |
| |
| method update( |
| Index $index, |
| AI::MXNet::NDArray $weight, |
| AI::MXNet::NDArray $grad, |
| ArrayRef[AI::MXNet::NDArray] $state |
| ) |
| { |
| my $wd = $self->_get_wd($index); |
| my $lr = $self->_get_lr($index); |
| $self->_update_count($index); |
| my $t = $self->_index_update_count->{$index}; |
| $lr /= (1 - $self->beta1**$t); |
| |
| $grad = $grad * $self->rescale_grad + $wd * $weight; |
| if($self->clip_gradient) |
| { |
| $grad = AI::MXNet::NDArray->clip( |
| $grad, |
| -$self->clip_gradient, |
| $self->clip_gradient |
| ); |
| } |
| |
| # update m_t and u_t |
| my($m_t, $u_t) = @{ $state }; |
| $m_t .= $self->beta1 * $m_t + (1 - $self->beta1) * $grad; |
| $u_t .= AI::MXNet::NDArray->maximum($self->beta2 * $u_t, $grad->abs); |
| |
| # update weight |
| $weight -= $lr * $m_t / $u_t; |
| } |
| |
| __PACKAGE__->register; |
| |
| package AI::MXNet::Nadam; |
| |
| =head1 NAME |
| |
| AI::MXNet::Nadam |
| =cut |
| |
| =head1 DESCRIPTION |
| |
| The Nesterov Adam optimizer. |
| |
    Much like Adam is essentially RMSprop with momentum,
    Nadam is Adam with Nesterov momentum; see
    http://cs229.stanford.edu/proj2015/054_report.pdf.
| |
| This optimizer accepts the following parameters in addition to those accepted |
| by AI::MXNet::Optimizer. |
| |
| Parameters |
| ---------- |
| beta1 : Num, optional |
| Exponential decay rate for the first moment estimates. |
| beta2 : Num, optional |
| Exponential decay rate for the second moment estimates. |
| epsilon : Num, optional |
| Small value to avoid division by 0. |
| schedule_decay : Num, optional |
| Exponential decay rate for the momentum schedule |
| =cut |
| |
| use Mouse; |
| extends 'AI::MXNet::Optimizer'; |
| has '+learning_rate' => (default => 0.001); |
| has 'beta1' => (is => "ro", isa => "Num", default => 0.9); |
| has 'beta2' => (is => "ro", isa => "Num", default => 0.999); |
| has 'epsilon' => (is => "ro", isa => "Num", default => 1e-8); |
| has 'schedule_decay' => (is => "ro", isa => "Num", default => 0.004); |
| has 'm_schedule' => (is => "rw", default => 1, init_arg => undef); |
| |
| method create_state(Index $index, AI::MXNet::NDArray $weight) |
| { |
| return [ |
| AI::MXNet::NDArray->zeros( |
| $weight->shape, |
| ctx => $weight->context, |
| dtype => $weight->dtype |
| ), # mean |
| AI::MXNet::NDArray->zeros( |
| $weight->shape, |
| ctx => $weight->context, |
| dtype => $weight->dtype |
| ) # variance |
| ]; |
| } |
| |
| method update( |
| Index $index, |
| AI::MXNet::NDArray $weight, |
| AI::MXNet::NDArray $grad, |
| ArrayRef[AI::MXNet::NDArray] $state |
| ) |
| { |
| my $wd = $self->_get_wd($index); |
| my $lr = $self->_get_lr($index); |
| $self->_update_count($index); |
| my $t = $self->_index_update_count->{$index}; |
| $grad = $grad * $self->rescale_grad + $wd * $weight; |
| if($self->clip_gradient) |
| { |
| $grad = AI::MXNet::NDArray->clip( |
| $grad, |
| -$self->clip_gradient, |
| $self->clip_gradient |
| ); |
| } |
| # warming momentum schedule |
| my $momentum_t = $self->beta1 * (1 - 0.5 * (0.96**($t * $self->schedule_decay))); |
| my $momentum_t_1 = $self->beta1 * (1 - 0.5 * (0.96**(($t + 1) * $self->schedule_decay))); |
    $self->m_schedule($self->m_schedule * $momentum_t);
| my $m_schedule_next = $self->m_schedule * $momentum_t_1; |
| |
| # update m_t and v_t |
| my ($m_t, $v_t) = @{ $state }; |
| $m_t .= $self->beta1 * $m_t + (1 - $self->beta1) * $grad; |
| $v_t .= $self->beta2 * $v_t + (1 - $self->beta2) * $grad * $grad; |
| |
| my $grad_prime = $grad / (1 - $self->m_schedule); |
| my $m_t_prime = $m_t / (1 - $m_schedule_next); |
| my $v_t_prime = $v_t / (1 - $self->beta2**$t); |
| my $m_t_bar = (1 - $momentum_t) * $grad_prime + $momentum_t_1 * $m_t_prime; |
| |
| # update weight |
| $weight -= $lr * $m_t_bar / (sqrt($v_t_prime) + $self->epsilon); |
| } |
| |
| __PACKAGE__->register; |
| |
| =head1 NAME |
| |
| AI::MXNet::Updater - Updater for kvstore |
| =cut |
| |
| package AI::MXNet::Updater; |
| use Mouse; |
| use Storable qw(thaw freeze); |
| use overload "&{}" => sub { my $self = shift; sub { $self->call(@_) } }, |
| fallback => 1; |
| |
| has "optimizer" => (is => "rw", isa => "AI::MXNet::Optimizer"); |
| has "states" => (is => "rw", isa => "HashRef", default => sub { +{} }); |
| has "states_synced" => (is => "rw", isa => "HashRef", default => sub { +{} }); |
| |
| method call(Index $index, AI::MXNet::NDArray $grad, AI::MXNet::NDArray $weight) |
| { |
| if(not exists $self->states->{ $index }) |
| { |
| $self->states->{ $index } = $self->optimizer->create_state($index, $weight); |
| $self->states_synced->{ $index } = 1; |
| } |
| elsif(not $self->states_synced->{ $index }) |
| { |
| $self->states->{ $index } = $self->sync_state_context($self->states->{ $index }, $weight->context); |
| $self->states_synced->{ $index } = 1; |
| } |
| $self->optimizer->update($index, $weight, $grad, $self->states->{ $index }); |
| } |
| *slice = *call; |
| |
| method sync_state_context(Maybe[AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]] $state, AI::MXNet::Context $context) |
| { |
| if(blessed $state) |
| { |
| return $state->as_in_context($context); |
| } |
| elsif(ref $state) |
| { |
| return [map { $self->sync_state_context($_, $context) } @{ $state }]; |
| } |
| return $state; |
| } |
| |
| =head2 set_states |
| |
| Sets updater states. |
| =cut |
| |
| method set_states($states) |
| { |
| my $thawed_states = thaw($states); |
| my ($optimizer); |
| if(ref $thawed_states eq 'ARRAY') |
| { |
| ($thawed_states, $optimizer) = @{ $thawed_states }; |
| $self->optimizer($optimizer); |
| } |
| $self->states($thawed_states); |
| %{ $self->states_synced } = map { $_ => 0 } keys %{ $thawed_states }; |
| } |
| |
| =head2 get_states |
| |
| Gets updater states. |
| |
| Parameters |
| ---------- |
    dump_optimizer : Bool, default 0
        Whether to also save the optimizer itself. This also saves optimizer
        information such as the learning rate and weight decay schedules.
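
    Example
    -------

    A minimal round-trip sketch; the optimizer, shapes, and values are illustrative only:

        use AI::MXNet;
        my $opt     = AI::MXNet::Optimizer->create_optimizer('sgd', learning_rate => 0.1);
        my $updater = AI::MXNet::Optimizer->get_updater($opt);
        my $weight  = AI::MXNet::NDArray->ones([2, 2]);
        my $grad    = AI::MXNet::NDArray->ones([2, 2]) * 0.5;
        $updater->(0, $grad, $weight);        # creates and uses per-index state
        my $blob = $updater->get_states(1);   # serialize states plus the optimizer
        $updater->set_states($blob);          # restore them later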
| =cut |
| |
| method get_states(Bool $dump_optimizer=0) |
| { |
| if($dump_optimizer) |
| { |
| my $param_dict = $self->optimizer->param_dict; |
| $self->optimizer->param_dict({}); |
| my $freezed = freeze([$self->states, $self->optimizer]); |
| $self->optimizer->param_dict($param_dict); |
| return $freezed; |
| } |
| return freeze($self->states); |
| } |
| |
| package AI::MXNet::Optimizer; |
| |
| method get_updater(AI::MXNet::Optimizer $optimizer) |
| { |
| return AI::MXNet::Updater->new(optimizer => $optimizer); |
| } |
| |
| 1; |