<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<meta http-equiv="x-ua-compatible" content="ie=edge">
<meta name="lang:clipboard.copy" content="Copy to clipboard">
<meta name="lang:clipboard.copied" content="Copied to clipboard">
<meta name="lang:search.language" content="en">
<meta name="lang:search.pipeline.stopwords" content="True">
<meta name="lang:search.pipeline.trimmer" content="True">
<meta name="lang:search.result.none" content="No matching documents">
<meta name="lang:search.result.one" content="1 matching document">
<meta name="lang:search.result.other" content="# matching documents">
<meta name="lang:search.tokenizer" content="[\s\-]+">
<link rel="shortcut icon" href="../../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.0.4, mkdocs-material-4.6.0">
<title>Optimizers - MXNet.jl</title>
<link rel="stylesheet" href="../../assets/stylesheets/application.1b62728e.css">
<script src="../../assets/javascripts/modernizr.268332fc.js"></script>
<link href="https://fonts.gstatic.com" rel="preconnect" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,400,400i,700%7CRoboto+Mono&display=fallback">
<style>body,input{font-family:"Roboto","Helvetica Neue",Helvetica,Arial,sans-serif}code,kbd,pre{font-family:"Roboto Mono","Courier New",Courier,monospace}</style>
<link rel="stylesheet" href="../../assets/fonts/material-icons.css">
<link rel="stylesheet" href="../../assets/Documenter.css">
</head>
<body dir="ltr">
<svg class="md-svg">
<defs>
<svg xmlns="http://www.w3.org/2000/svg" width="416" height="448" viewBox="0 0 416 448" id="__github"><path fill="currentColor" d="M160 304q0 10-3.125 20.5t-10.75 19T128 352t-18.125-8.5-10.75-19T96 304t3.125-20.5 10.75-19T128 256t18.125 8.5 10.75 19T160 304zm160 0q0 10-3.125 20.5t-10.75 19T288 352t-18.125-8.5-10.75-19T256 304t3.125-20.5 10.75-19T288 256t18.125 8.5 10.75 19T320 304zm40 0q0-30-17.25-51T296 232q-10.25 0-48.75 5.25Q229.5 240 208 240t-39.25-2.75Q130.75 232 120 232q-29.5 0-46.75 21T56 304q0 22 8 38.375t20.25 25.75 30.5 15 35 7.375 37.25 1.75h42q20.5 0 37.25-1.75t35-7.375 30.5-15 20.25-25.75T360 304zm56-44q0 51.75-15.25 82.75-9.5 19.25-26.375 33.25t-35.25 21.5-42.5 11.875-42.875 5.5T212 416q-19.5 0-35.5-.75t-36.875-3.125-38.125-7.5-34.25-12.875T37 371.5t-21.5-28.75Q0 312 0 260q0-59.25 34-99-6.75-20.5-6.75-42.5 0-29 12.75-54.5 27 0 47.5 9.875t47.25 30.875Q171.5 96 212 96q37 0 70 8 26.25-20.5 46.75-30.25T376 64q12.75 25.5 12.75 54.5 0 21.75-6.75 42 34 40 34 99.5z"/></svg>
</defs>
</svg>
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" data-md-component="overlay" for="__drawer"></label>
<a href="#optimizers" tabindex="1" class="md-skip">
Skip to content
</a>
<header class="md-header" data-md-component="header">
<nav class="md-header-nav md-grid">
<div class="md-flex">
<div class="md-flex__cell md-flex__cell--shrink">
<a href="../.." title="MXNet.jl" class="md-header-nav__button md-logo">
<i class="md-icon"></i>
</a>
</div>
<div class="md-flex__cell md-flex__cell--shrink">
<label class="md-icon md-icon--menu md-header-nav__button" for="__drawer"></label>
</div>
<div class="md-flex__cell md-flex__cell--stretch">
<div class="md-flex__ellipsis md-header-nav__title" data-md-component="title">
<span class="md-header-nav__topic">
MXNet.jl
</span>
<span class="md-header-nav__topic">
Optimizers
</span>
</div>
</div>
<div class="md-flex__cell md-flex__cell--shrink">
<label class="md-icon md-icon--search md-header-nav__button" for="__search"></label>
<div class="md-search" data-md-component="search" role="dialog">
<label class="md-search__overlay" for="__search"></label>
<div class="md-search__inner" role="search">
<form class="md-search__form" name="search">
<input type="text" class="md-search__input" name="query" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="query" data-md-state="active">
<label class="md-icon md-search__icon" for="__search"></label>
<button type="reset" class="md-icon md-search__icon" data-md-component="reset" tabindex="-1">
&#xE5CD;
</button>
</form>
<div class="md-search__output">
<div class="md-search__scrollwrap" data-md-scrollfix>
<div class="md-search-result" data-md-component="result">
<div class="md-search-result__meta">
Type to start searching
</div>
<ol class="md-search-result__list"></ol>
</div>
</div>
</div>
</div>
</div>
</div>
<div class="md-flex__cell md-flex__cell--shrink">
<div class="md-header-nav__source">
<a href="https://github.com/apache/mxnet/tree/master/julia#mxnet/" title="Go to repository" class="md-source" data-md-source="github">
<div class="md-source__icon">
<svg viewBox="0 0 24 24" width="24" height="24">
<use xlink:href="#__github" width="24" height="24"></use>
</svg>
</div>
<div class="md-source__repository">
GitHub
</div>
</a>
</div>
</div>
</div>
</nav>
</header>
<div class="md-container">
<main class="md-main" role="main">
<div class="md-main__inner md-grid" data-md-component="container">
<div class="md-sidebar md-sidebar--primary" data-md-component="navigation">
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" data-md-level="0">
<label class="md-nav__title md-nav__title--site" for="__drawer">
<a href="../.." title="MXNet.jl" class="md-nav__button md-logo">
<i class="md-icon"></i>
</a>
MXNet.jl
</label>
<div class="md-nav__source">
<a href="https://github.com/apache/mxnet/tree/master/julia#mxnet/" title="Go to repository" class="md-source" data-md-source="github">
<div class="md-source__icon">
<svg viewBox="0 0 24 24" width="24" height="24">
<use xlink:href="#__github" width="24" height="24"></use>
</svg>
</div>
<div class="md-source__repository">
GitHub
</div>
</a>
</div>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../.." title="Home" class="md-nav__link">
Home
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-toggle md-nav__toggle" data-md-toggle="nav-2" type="checkbox" id="nav-2">
<label class="md-nav__link" for="nav-2">
Tutorial
</label>
<nav class="md-nav" data-md-component="collapsible" data-md-level="1">
<label class="md-nav__title" for="nav-2">
Tutorial
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../tutorial/mnist/" title="Digit Recognition on MNIST" class="md-nav__link">
Digit Recognition on MNIST
</a>
</li>
<li class="md-nav__item">
<a href="../../tutorial/char-lstm/" title="Generating Random Sentence with LSTM RNN" class="md-nav__link">
Generating Random Sentence with LSTM RNN
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-toggle md-nav__toggle" data-md-toggle="nav-3" type="checkbox" id="nav-3">
<label class="md-nav__link" for="nav-3">
User Guide
</label>
<nav class="md-nav" data-md-component="collapsible" data-md-level="1">
<label class="md-nav__title" for="nav-3">
User Guide
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../user-guide/install/" title="Installation Guide" class="md-nav__link">
Installation Guide
</a>
</li>
<li class="md-nav__item">
<a href="../../user-guide/overview/" title="Overview" class="md-nav__link">
Overview
</a>
</li>
<li class="md-nav__item">
<a href="../../user-guide/faq/" title="FAQ" class="md-nav__link">
FAQ
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-toggle md-nav__toggle" data-md-toggle="nav-4" type="checkbox" id="nav-4" checked>
<label class="md-nav__link" for="nav-4">
API Documentation
</label>
<nav class="md-nav" data-md-component="collapsible" data-md-level="1">
<label class="md-nav__title" for="nav-4">
API Documentation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../context/" title="Context" class="md-nav__link">
Context
</a>
</li>
<li class="md-nav__item">
<a href="../model/" title="Models" class="md-nav__link">
Models
</a>
</li>
<li class="md-nav__item">
<a href="../initializer/" title="Initializers" class="md-nav__link">
Initializers
</a>
</li>
<li class="md-nav__item md-nav__item--active">
<input class="md-toggle md-nav__toggle" data-md-toggle="toc" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
Optimizers
</label>
<a href="./" title="Optimizers" class="md-nav__link md-nav__link--active">
Optimizers
</a>
<nav class="md-nav md-nav--secondary">
<label class="md-nav__title" for="__toc">Table of contents</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="#built-in-optimizers" class="md-nav__link">
Built-in optimizers
</a>
<nav class="md-nav">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#stochastic-gradient-descent" class="md-nav__link">
Stochastic Gradient Descent
</a>
</li>
<li class="md-nav__item">
<a href="#adam" class="md-nav__link">
ADAM
</a>
</li>
<li class="md-nav__item">
<a href="#adagrad" class="md-nav__link">
AdaGrad
</a>
</li>
<li class="md-nav__item">
<a href="#adadelta" class="md-nav__link">
AdaDelta
</a>
</li>
<li class="md-nav__item">
<a href="#adamax" class="md-nav__link">
AdaMax
</a>
</li>
<li class="md-nav__item">
<a href="#rmsprop" class="md-nav__link">
RMSProp
</a>
</li>
<li class="md-nav__item">
<a href="#nadam" class="md-nav__link">
Nadam
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../callback/" title="Callbacks in training" class="md-nav__link">
Callbacks in training
</a>
</li>
<li class="md-nav__item">
<a href="../metric/" title="Evaluation Metrics" class="md-nav__link">
Evaluation Metrics
</a>
</li>
<li class="md-nav__item">
<a href="../io/" title="Data Providers" class="md-nav__link">
Data Providers
</a>
</li>
<li class="md-nav__item">
<a href="../ndarray/" title="NDArray API" class="md-nav__link">
NDArray API
</a>
</li>
<li class="md-nav__item">
<a href="../symbolic-node/" title="Symbolic API" class="md-nav__link">
Symbolic API
</a>
</li>
<li class="md-nav__item">
<a href="../nn-factory/" title="Neural Networks Factory" class="md-nav__link">
Neural Networks Factory
</a>
</li>
<li class="md-nav__item">
<a href="../executor/" title="Executor" class="md-nav__link">
Executor
</a>
</li>
<li class="md-nav__item">
<a href="../kvstore/" title="Key-Value Store" class="md-nav__link">
Key-Value Store
</a>
</li>
<li class="md-nav__item">
<a href="../visualize/" title="Network Visualization" class="md-nav__link">
Network Visualization
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="toc">
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary">
<label class="md-nav__title" for="__toc">Table of contents</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="#built-in-optimizers" class="md-nav__link">
Built-in optimizers
</a>
<nav class="md-nav">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#stochastic-gradient-descent" class="md-nav__link">
Stochastic Gradient Descent
</a>
</li>
<li class="md-nav__item">
<a href="#adam" class="md-nav__link">
ADAM
</a>
</li>
<li class="md-nav__item">
<a href="#adagrad" class="md-nav__link">
AdaGrad
</a>
</li>
<li class="md-nav__item">
<a href="#adadelta" class="md-nav__link">
AdaDelta
</a>
</li>
<li class="md-nav__item">
<a href="#adamax" class="md-nav__link">
AdaMax
</a>
</li>
<li class="md-nav__item">
<a href="#rmsprop" class="md-nav__link">
RMSProp
</a>
</li>
<li class="md-nav__item">
<a href="#nadam" class="md-nav__link">
Nadam
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content">
<article class="md-content__inner md-typeset">
<a href="https://github.com/apache/mxnet/tree/master/edit/master/docs/api/optimizer.md" title="Edit this page" class="md-icon md-content__icon">&#xE3C9;</a>
<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0  Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the License for the specific language governing permissions and limitations under the License. -->
<p><a id='Optimizers-1'></a></p>
<h1 id="optimizers">Optimizers</h1>
<p>Say you have a parameter <code>W</code> initialized for your model and its gradient stored in <code>∇</code> (perhaps from AutoGrad APIs). Here is a minimal snippet that updates <code>W</code> with <code>SGD</code>.</p>
<pre><code class="julia-repl">julia&gt; using MXNet
julia&gt; opt = SGD(η = 10)
SGD(10, 0.0, 0, 0, 0.0001, MXNet.mx.LearningRate.Fixed(10.0), MXNet.mx.Momentum.Null())
julia&gt; descend! = getupdater(opt)
(::getfield(MXNet.mx, Symbol(&quot;#updater#9272&quot;)){SGD,Dict{Int64,Any}}) (generic function with 1 method)
julia&gt; W = NDArray(Float32[1, 2, 3, 4]);
julia&gt; ∇ = NDArray(Float32[.1, .2, .3, .4]);
julia&gt; descend!(1, ∇, W)
4-element NDArray{Float32,1} @ cpu0:
-0.0010000467f0
-0.0020000935f0
-0.003000021f0
-0.004000187f0
</code></pre>
<p><a id='MXNet.mx.AbstractOptimizer' href='#MXNet.mx.AbstractOptimizer'>#</a>
<strong><code>MXNet.mx.AbstractOptimizer</code></strong> &mdash; <em>Type</em>.</p>
<pre><code class="julia">AbstractOptimizer
</code></pre>
<p>Base type for all optimizers.</p>
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L22-L26' class='documenter-source'>source</a><br></p>
<p><a id='MXNet.mx.getupdater-Tuple{AbstractOptimizer}' href='#MXNet.mx.getupdater-Tuple{AbstractOptimizer}'>#</a>
<strong><code>MXNet.mx.getupdater</code></strong> &mdash; <em>Method</em>.</p>
<pre><code class="julia">getupdater(optimizer)
</code></pre>
<p>A utility function to create an updater function for <code>KVStore</code>. It uses a closure to store all the states needed for each weight.</p>
<p>The returned function has the following signature:</p>
<pre><code class="julia">decend!(index::Int, ∇::NDArray, x::NDArray)
</code></pre>
<p>If the optimizer is stateful and needs to access or store state during updating, <code>index</code> will be the key used to retrieve and store that state.</p>
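<p>For illustration, a small sketch of driving two different weights through one updater; the optimizer choice and the names <code>update!</code>, <code>W₁</code>, <code>W₂</code> below are illustrative only:</p>
<pre><code class="julia">using MXNet

opt     = ADAM()              # any stateful optimizer works here
update! = getupdater(opt)

W₁ = NDArray(Float32[1, 2]);  ∇₁ = NDArray(Float32[0.1, 0.2])
W₂ = NDArray(Float32[3, 4]);  ∇₂ = NDArray(Float32[0.3, 0.4])

update!(1, ∇₁, W₁)   # optimizer state for W₁ is kept under key 1
update!(2, ∇₂, W₂)   # optimizer state for W₂ is kept under key 2
</code></pre>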
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L252-L266' class='documenter-source'>source</a><br></p>
<p><a id='MXNet.mx.normgrad!-Tuple{AbstractOptimizer,NDArray,NDArray}' href='#MXNet.mx.normgrad!-Tuple{AbstractOptimizer,NDArray,NDArray}'>#</a>
<strong><code>MXNet.mx.normgrad!</code></strong> &mdash; <em>Method</em>.</p>
<pre><code class="julia">normgrad(optimizer, W, ∇)
</code></pre>
<p>Get the properly normalized gradient (re-scaled and clipped if necessary).</p>
<ul>
<li><code>optimizer</code>: the optimizer, should contain the fields <code>scale</code>, <code>clip</code>, and <code>λ</code>.</li>
<li><code>W::NDArray</code>: the trainable weights.</li>
<li><code>∇::NDArray</code>: the original gradient of the weights.</li>
</ul>
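<p>As a rough, plain-Julia sketch of what this normalization amounts to, assuming scalar <code>scale</code>, <code>clip</code>, <code>λ</code> and plain arrays (an illustration, not the library implementation):</p>
<pre><code class="julia"># Illustrative sketch only; this hypothetical helper mirrors the description above.
function normalized_gradient(scale, clip, λ, W, ∇)
    g = scale != 0 ? scale .* ∇ : copy(∇)   # optional rescaling
    if clip &gt; 0
        g = clamp.(g, -clip, clip)          # optional clipping to [-clip, clip]
    end
    g .+ λ .* W                             # weight decay as a global L2 term
end
</code></pre>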
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L278-L287' class='documenter-source'>source</a><br></p>
<p><a id='MXNet.mx.AbstractLearningRateScheduler' href='#MXNet.mx.AbstractLearningRateScheduler'>#</a>
<strong><code>MXNet.mx.AbstractLearningRateScheduler</code></strong> &mdash; <em>Type</em>.</p>
<pre><code class="julia">AbstractLearningRateScheduler
</code></pre>
<p>Base type for all learning rate schedulers.</p>
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L29-L33' class='documenter-source'>source</a><br></p>
<p><a id='MXNet.mx.AbstractMomentumScheduler' href='#MXNet.mx.AbstractMomentumScheduler'>#</a>
<strong><code>MXNet.mx.AbstractMomentumScheduler</code></strong> &mdash; <em>Type</em>.</p>
<pre><code class="julia">AbstractMomentumScheduler
</code></pre>
<p>Base type for all momentum schedulers.</p>
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L36-L40' class='documenter-source'>source</a><br></p>
<p><a id='MXNet.mx.OptimizationState' href='#MXNet.mx.OptimizationState'>#</a>
<strong><code>MXNet.mx.OptimizationState</code></strong> &mdash; <em>Type</em>.</p>
<pre><code class="julia">OptimizationState
</code></pre>
<p><strong>Attributes</strong></p>
<ul>
<li><code>batch_size</code>: The size of the mini-batch used in stochastic training.</li>
<li><code>curr_epoch</code>: The current epoch count. Epoch 0 means no training yet; during the first pass through the data the epoch count will be 1, during the second pass it will be 2, and so on.</li>
<li><code>curr_batch</code>: The current mini-batch count. The batch count is reset during every epoch. The batch count 0 means the beginning of each epoch, with no mini-batch seen yet. During the first mini-batch, the mini-batch count will be 1.</li>
<li><code>curr_iter</code>: The current iteration count. One iteration corresponds to one mini-batch, but unlike the mini-batch count, the iteration count does <strong>not</strong> reset in each epoch. So it tracks the <em>total</em> number of mini-batches seen so far.</li>
</ul>
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L43-L60' class='documenter-source'>source</a><br></p>
<p><a id='MXNet.mx.LearningRate.Exp' href='#MXNet.mx.LearningRate.Exp'>#</a>
<strong><code>MXNet.mx.LearningRate.Exp</code></strong> &mdash; <em>Type</em>.</p>
<pre><code class="julia">LearningRate.Exp(η₀; γ = 0.9)
</code></pre>
<p>
<script type="math/tex; mode=display">
\eta_t = \eta_0\gamma^t
</script>
</p>
<p>Where <code>t</code> is the epoch count, or the iteration count.</p>
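<p>For instance, evaluating the schedule directly in plain Julia (illustrative values only):</p>
<pre><code class="julia">η₀, γ = 0.1, 0.9
η(t) = η₀ * γ^t          # the exponential schedule above
η.(0:3)                  # ≈ [0.1, 0.09, 0.081, 0.0729]
</code></pre>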
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L105' class='documenter-source'>source</a><br></p>
<p><a id='MXNet.mx.LearningRate.Fixed' href='#MXNet.mx.LearningRate.Fixed'>#</a>
<strong><code>MXNet.mx.LearningRate.Fixed</code></strong> &mdash; <em>Type</em>.</p>
<pre><code class="julia">LearningRate.Fixed(η)
</code></pre>
<p>The fixed learning rate scheduler always returns the same learning rate.</p>
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L94-L98' class='documenter-source'>source</a><br></p>
<p><a id='MXNet.mx.LearningRate.Inv' href='#MXNet.mx.LearningRate.Inv'>#</a>
<strong><code>MXNet.mx.LearningRate.Inv</code></strong> &mdash; <em>Type</em>.</p>
<pre><code class="julia">LearningRate.Inv(η₀; γ = 0.9, p = 0.5)
</code></pre>
<p>
<script type="math/tex; mode=display">
\eta_t = \eta_0 (1 + \gamma t)^{-p}
</script>
</p>
<p>Where <code>t</code> is the epoch count, or the iteration count.</p>
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L127' class='documenter-source'>source</a><br></p>
<p><a id='Base.get-Tuple{MXNet.mx.AbstractLearningRateScheduler}' href='#Base.get-Tuple{MXNet.mx.AbstractLearningRateScheduler}'>#</a>
<strong><code>Base.get</code></strong> &mdash; <em>Method</em>.</p>
<pre><code class="julia">get(sched::AbstractLearningRateScheduler)
</code></pre>
<p>Returns the current learning rate.</p>
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L87-L91' class='documenter-source'>source</a><br></p>
<p><a id='MXNet.mx.Momentum.Fixed' href='#MXNet.mx.Momentum.Fixed'>#</a>
<strong><code>MXNet.mx.Momentum.Fixed</code></strong> &mdash; <em>Type</em>.</p>
<pre><code class="julia">Momentum.Fixed
</code></pre>
<p>The fixed momentum scheduler always returns the same value.</p>
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L190-L194' class='documenter-source'>source</a><br></p>
<p><a id='MXNet.mx.Momentum.NadamScheduler' href='#MXNet.mx.Momentum.NadamScheduler'>#</a>
<strong><code>MXNet.mx.Momentum.NadamScheduler</code></strong> &mdash; <em>Type</em>.</p>
<pre><code class="julia">NadamScheduler(; μ = 0.99, δ = 0.004, γ = 0.5, α = 0.96)
</code></pre>
<p>Nesterov-accelerated adaptive momentum scheduler.</p>
<p>Description in <a href="http://cs229.stanford.edu/proj2015/054_report.pdf">Incorporating Nesterov Momentum into Adam</a>.</p>
<p>
<script type="math/tex; mode=display">
\mu_t = \mu_0 * (1 - \gamma * \alpha^{t * \delta})
</script>
</p>
<p>Where</p>
<ul>
<li><code>t</code>: iteration count</li>
<li><code>μ</code>: default <code>0.99</code>, the initial momentum μ₀.</li>
<li><code>δ</code>: default <code>0.004</code>, the scheduler decay.</li>
<li><code>γ</code>: default <code>0.5</code></li>
<li><code>α</code>: default <code>0.96</code></li>
</ul>
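<p>With the default values the momentum slowly anneals towards μ₀ as <code>t</code> grows; a plain-Julia check of the formula (illustrative only):</p>
<pre><code class="julia">μ₀, δ, γ, α = 0.99, 0.004, 0.5, 0.96
μ(t) = μ₀ * (1 - γ * α^(t * δ))
μ(1)      # ≈ 0.495
μ(1000)   # ≈ 0.570
</code></pre>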
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L201' class='documenter-source'>source</a><br></p>
<p><a id='MXNet.mx.Momentum.Null' href='#MXNet.mx.Momentum.Null'>#</a>
<strong><code>MXNet.mx.Momentum.Null</code></strong> &mdash; <em>Type</em>.</p>
<pre><code class="julia">Momentum.Null
</code></pre>
<p>The null momentum scheduler always returns 0 for momentum. It is also used to explicitly indicate momentum should not be used.</p>
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L179-L184' class='documenter-source'>source</a><br></p>
<p><a id='Base.get-Tuple{MXNet.mx.Momentum.NadamScheduler,Any}' href='#Base.get-Tuple{MXNet.mx.Momentum.NadamScheduler,Any}'>#</a>
<strong><code>Base.get</code></strong> &mdash; <em>Method</em>.</p>
<pre><code class="julia">get(n::NadamScheduler, t)
</code></pre>
<p>Where <code>t</code> is the iteration count.</p>
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L235-L239' class='documenter-source'>source</a><br></p>
<p><a id='Built-in-optimizers-1'></a></p>
<h2 id="built-in-optimizers">Built-in optimizers</h2>
<p><a id='Stochastic-Gradient-Descent-1'></a></p>
<h3 id="stochastic-gradient-descent">Stochastic Gradient Descent</h3>
<p><a id='MXNet.mx.SGD' href='#MXNet.mx.SGD'>#</a>
<strong><code>MXNet.mx.SGD</code></strong> &mdash; <em>Type</em>.</p>
<pre><code class="julia">SGD(; kwargs...)
</code></pre>
<p>Stochastic gradient descent optimizer.</p>
<p>Vanilla SGD:</p>
<p>
<script type="math/tex; mode=display">
\theta \leftarrow \theta - \eta \nabla
</script>
</p>
<p>SGD with momentum:</p>
<p>
<script type="math/tex; mode=display">
\begin{align*}
\nu & \leftarrow \mu \nu_{t-1} - \eta \nabla \\
\theta & \leftarrow \theta + \nu_t
\end{align*}
</script>
</p>
<p><strong>Arguments</strong></p>
<ul>
<li><code>η</code>: default <code>0.01</code>, learning rate.</li>
<li><code>μ</code>: default <code>0</code>, the momentum, usually set to <code>0.9</code> in this implementation.</li>
<li><code>λ</code>: default <code>0.0001</code>, weight decay is equivalent to adding a global l2 regularizer to the parameters.</li>
<li><code>clip</code>: default <code>0</code>, gradient clipping. If positive, will clip the gradient into the bounded range <code>[-clip, clip]</code>.</li>
<li><code>scale</code>: default <code>0</code>, gradient rescaling. If != 0, multiply the gradient with <code>scale</code> before updating. Often chosen to be <code>1.0 / batch_size</code>. If left at the default, high-level APIs like <code>fit!</code> will set it to <code>1.0 / batch_size</code>, since <code>fit!</code> knows the <code>batch_size</code>.</li>
<li><code>μ_sched::AbstractMomentumScheduler</code>: default <code>Momentum.Null()</code>, a dynamic momentum scheduler. If set, will overwrite the <code>momentum</code> parameter.</li>
<li><code>η_sched::AbstractLearningRateScheduler</code>: default <code>LearningRate.Fixed(η)</code>, a dynamic learning rate scheduler. If set, will overwrite the <code>η</code> parameter.</li>
</ul>
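<p>As a usage sketch, mirroring the snippet at the top of this page (variable names are illustrative):</p>
<pre><code class="julia">using MXNet

opt      = SGD(η = 0.01, μ = 0.9, clip = 5)   # momentum SGD with gradient clipping
descend! = getupdater(opt)

W = NDArray(Float32[1, 2, 3, 4])
∇ = NDArray(Float32[0.1, 0.2, 0.3, 0.4])
descend!(1, ∇, W)                             # one update step on W
</code></pre>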
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizers/sgd.jl#L18' class='documenter-source'>source</a><br></p>
<p><a id='ADAM-1'></a></p>
<h3 id="adam">ADAM</h3>
<p><a id='MXNet.mx.ADAM' href='#MXNet.mx.ADAM'>#</a>
<strong><code>MXNet.mx.ADAM</code></strong> &mdash; <em>Type</em>.</p>
<pre><code class="julia"> ADAM
</code></pre>
<p>The solver described in Diederik Kingma, Jimmy Ba: <em>Adam: A Method for Stochastic Optimization</em>. arXiv:1412.6980 [cs.LG].</p>
<pre><code>ADAM(; kwargs...)
</code></pre>
<p><strong>Arguments</strong></p>
<ul>
<li><code>η</code>: default <code>0.001</code>, learning rate.</li>
<li><code>β1</code>: default <code>0.9</code>.</li>
<li><code>β2</code>: default <code>0.999</code>.</li>
<li><code>ϵ</code>: default <code>1e-8</code>.</li>
<li><code>clip</code>: default <code>0</code>, gradient clipping. If positive, will clip the gradient into the range <code>[-clip, clip]</code>.</li>
<li><code>scale</code>: default <code>0</code>, gradient rescaling. If != 0, multiply the gradient with <code>scale</code> before updating. Often chosen to be <code>1.0 / batch_size</code>. If left at the default, high-level APIs like <code>fit!</code> will set it to <code>1.0 / batch_size</code>, since <code>fit!</code> knows the <code>batch_size</code>.</li>
<li><code>λ</code>: default <code>0.00001</code>, weight decay is equivalent to adding a global l2 regularizer for all the parameters.</li>
<li><code>η_sched::AbstractLearningRateScheduler</code>: default <code>LearningRate.Fixed(η)</code>, a dynamic learning rate scheduler. If set, will overwrite the <code>η</code> parameter.</li>
</ul>
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizers/adam.jl#L18-L42' class='documenter-source'>source</a><br></p>
<p><a id='AdaGrad-1'></a></p>
<h3 id="adagrad">AdaGrad</h3>
<p><a id='MXNet.mx.AdaGrad' href='#MXNet.mx.AdaGrad'>#</a>
<strong><code>MXNet.mx.AdaGrad</code></strong> &mdash; <em>Type</em>.</p>
<pre><code class="julia">AdaGrad(; kwargs...)
</code></pre>
<p>Scale learning rates by dividing with the square root of accumulated squared gradients. See [1] for further description.</p>
<p><strong>Arguments</strong></p>
<ul>
<li><code>η</code>: default <code>0.1</code>, learning rate.</li>
<li><code>ϵ</code>: default <code>1e-6</code>, small value added for numerical stability.</li>
<li><code>clip</code>: default <code>0</code>, gradient clipping. If positive, will clip the gradient into the range <code>[-clip, clip]</code>.</li>
<li><code>scale</code>: default <code>0</code>, gradient rescaling. If != 0, multiply the gradient with <code>scale</code> before updating. Often chosen to be <code>1.0 / batch_size</code>. If left at the default, high-level APIs like <code>fit!</code> will set it to <code>1.0 / batch_size</code>, since <code>fit!</code> knows the <code>batch_size</code>.</li>
<li><code>λ</code>: default <code>0.00001</code>, weight decay is equivalent to adding a global l2 regularizer for all the parameters.</li>
</ul>
<p><strong>Notes</strong></p>
<p>Using the step size <code>η</code>, AdaGrad calculates the learning rate for feature <code>i</code> at time step <code>t</code> as:</p>
<p>
<script type="math/tex; mode=display">
η_{t,i} = \frac{η}{\sqrt{\sum_{t^\prime = 1}^{t} g^2_{t^\prime,i} + ϵ}} g_{t,i}
</script>
</p>
<p>As such, the learning rate is monotonically decreasing. Epsilon is not included in the typical formula; see [2].</p>
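<p>The accumulation can be sketched element-wise in plain Julia (an illustration of the formula above, not the internal code):</p>
<pre><code class="julia">η, ϵ = 0.1, 1e-6
W = Float32[1, 2, 3];  g = Float32[0.1, 0.2, 0.3]
acc = zero(W)                        # running sum of squared gradients

acc .+= g .^ 2                       # accumulate g²
W   .-= η .* g ./ sqrt.(acc .+ ϵ)    # per-feature, monotonically shrinking step
</code></pre>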
<p><strong>References</strong></p>
<ol>
<li>Duchi, J., Hazan, E., &amp; Singer, Y. (2011): Adaptive subgradient methods for online learning and stochastic optimization. JMLR, 12:2121-2159.</li>
<li>Chris Dyer: Notes on AdaGrad. <a href="http://www.ark.cs.cmu.edu/cdyer/adagrad.pdf">http://www.ark.cs.cmu.edu/cdyer/adagrad.pdf</a></li>
</ol>
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizers/adagrad.jl#L18' class='documenter-source'>source</a><br></p>
<p><a id='AdaDelta-1'></a></p>
<h3 id="adadelta">AdaDelta</h3>
<p><a id='MXNet.mx.AdaDelta' href='#MXNet.mx.AdaDelta'>#</a>
<strong><code>MXNet.mx.AdaDelta</code></strong> &mdash; <em>Type</em>.</p>
<pre><code class="julia">AdaDelta(; kwargs...)
</code></pre>
<p>Scale learning rates by the ratio of accumulated gradients to accumulated updates, see [1] and notes for further description.</p>
<p><strong>Attributes</strong></p>
<ul>
<li><code>η</code>: default <code>1.0</code>, learning rate.</li>
<li><code>ρ</code>: default <code>0.95</code>, squared gradient moving average decay factor.</li>
<li><code>ϵ</code>: default <code>1e-6</code>, small value added for numerical stability.</li>
<li><code>clip</code>: default <code>0</code>, gradient clipping. If positive, will clip the gradient into the range <code>[-clip, clip]</code>.</li>
<li><code>scale</code>: default <code>0</code>, gradient rescaling. If != 0, multiply the gradient with <code>scale</code> before updating. Often chosen to be <code>1.0 / batch_size</code>. If left at the default, high-level APIs like <code>fit!</code> will set it to <code>1.0 / batch_size</code>, since <code>fit!</code> knows the <code>batch_size</code>.</li>
<li><code>λ</code>: default <code>0.00001</code>, weight decay is equivalent to adding a global l2 regularizer for all the parameters.</li>
</ul>
<p><strong>Notes</strong></p>
<p><code>ρ</code> should be between 0 and 1. A value of <code>ρ</code> close to 1 will decay the moving average slowly and a value close to 0 will decay the moving average fast.</p>
<p><code>ρ = 0.95</code> and <code>ϵ = 1e-6</code> are suggested in the paper and reported to work for multiple datasets (MNIST, speech). In the paper, no learning rate is considered (so <code>η = 1.0</code>). Probably best to keep it at this value.</p>
<p><code>ϵ</code> is important for the very first update (so the numerator does not become 0).</p>
<p>Using the step size <code>η</code> and a decay factor <code>ρ</code>, the learning rate is calculated as:</p>
<p>
<script type="math/tex; mode=display">
\begin{align*}
r_t &= ρ r_{t-1} + (1 - ρ) g^2 \\
η_t &= η \frac{\sqrt{s_{t-1} + ϵ}} {\sqrt{r_t + ϵ}} \\
s_t &= ρ s_{t-1} + (1 - ρ) _t \times g)^2
\end{align*}
</script>
</p>
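<p>Spelled out element-wise in plain Julia (an illustrative sketch of the three running quantities above; the final weight update <code>W .-= η_t .* g</code> is the usual AdaDelta step and is assumed here, not part of the formula shown):</p>
<pre><code class="julia">ρ, η, ϵ = 0.95, 1.0, 1e-6
W = Float32[1, 2, 3];  g = Float32[0.1, 0.2, 0.3]
r = zero(W);  s = zero(W)            # accumulated g² and accumulated updates

r  .= ρ .* r .+ (1 - ρ) .* g .^ 2
η_t = η .* sqrt.(s .+ ϵ) ./ sqrt.(r .+ ϵ)
s  .= ρ .* s .+ (1 - ρ) .* (η_t .* g) .^ 2
W .-= η_t .* g                       # assumed weight update (standard AdaDelta)
</code></pre>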
<p><strong>References</strong></p>
<ol>
<li>Zeiler, M. D. (2012): ADADELTA: An Adaptive Learning Rate Method. arXiv Preprint arXiv:1212.5701.</li>
</ol>
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizers/adadelta.jl#L18' class='documenter-source'>source</a><br></p>
<p><a id='AdaMax-1'></a></p>
<h3 id="adamax">AdaMax</h3>
<p><a id='MXNet.mx.AdaMax' href='#MXNet.mx.AdaMax'>#</a>
<strong><code>MXNet.mx.AdaMax</code></strong> &mdash; <em>Type</em>.</p>
<pre><code class="julia">AdaMax(; kwargs...)
</code></pre>
<p>This is a variant of the Adam algorithm based on the infinity norm. See [1] for further description.</p>
<p><strong>Arguments</strong></p>
<ul>
<li><code>η</code>: default <code>0.002</code>, learning rate.</li>
<li><code>β1</code>: default <code>0.9</code>, exponential decay rate for the first moment estimates.</li>
<li><code>β2</code>: default <code>0.999</code>, exponential decay rate for the weighted infinity norm estimates.</li>
<li><code>ϵ</code>: default <code>1e-8</code>, small value added for numerical stability.</li>
<li><code>clip</code>: default <code>0</code>, gradient clipping. If positive, will clip the gradient into the range <code>[-clip, clip]</code>.</li>
<li><code>scale</code>: default <code>0</code>, gradient rescaling. If != 0, multiply the gradient with <code>scale</code> before updating. Often chosen to be <code>1.0 / batch_size</code>. If left at the default, high-level APIs like <code>fit!</code> will set it to <code>1.0 / batch_size</code>, since <code>fit!</code> knows the <code>batch_size</code>.</li>
<li><code>λ</code>: default <code>0.00001</code>, weight decay is equivalent to adding a global l2 regularizer for all the parameters.</li>
</ul>
<p><strong>References</strong></p>
<ol>
<li>Kingma, Diederik, and Jimmy Ba (2014): Adam: A Method for Stochastic Optimization. Section 7. <a href="http://arxiv.org/abs/1412.6980">http://arxiv.org/abs/1412.6980</a>.</li>
</ol>
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizers/adamax.jl#L18-L45' class='documenter-source'>source</a><br></p>
<p><a id='RMSProp-1'></a></p>
<h3 id="rmsprop">RMSProp</h3>
<p><a id='MXNet.mx.RMSProp' href='#MXNet.mx.RMSProp'>#</a>
<strong><code>MXNet.mx.RMSProp</code></strong> &mdash; <em>Type</em>.</p>
<pre><code class="julia">RMSProp(; kwargs...)
</code></pre>
<p>Scale learning rates by dividing with the moving average of the root mean squared (RMS) gradients. See [1] for further description.</p>
<p><strong>Arguments</strong></p>
<ul>
<li><code>η</code>: default <code>0.1</code>, learning rate.</li>
<li><code>ρ</code>: default <code>0.9</code>, gradient moving average decay factor.</li>
<li><code>ϵ</code>: default <code>1e-8</code>, small value added for numerical stability.</li>
<li><code>clip</code>: default <code>0</code>, gradient clipping. If positive, will clip the gradient into the range <code>[-clip, clip]</code>.</li>
<li><code>scale</code>: default <code>0</code>, gradient rescaling. If != 0, multiply the gradient with <code>scale</code> before updating. Often chosen to be <code>1.0 / batch_size</code>. If left at the default, high-level APIs like <code>fit!</code> will set it to <code>1.0 / batch_size</code>, since <code>fit!</code> knows the <code>batch_size</code>.</li>
<li><code>λ</code>: default <code>0.00001</code>, weight decay is equivalent to adding a global l2 regularizer for all the parameters.</li>
</ul>
<p><strong>Notes</strong></p>
<p><code>ρ</code> should be between 0 and 1. A value of <code>ρ</code> close to 1 will decay the moving average slowly and a value close to 0 will decay the moving average fast.</p>
<p>Using the step size <code>η</code> and a decay factor <code>ρ</code>, the learning rate <code>ηₜ</code> is calculated as:</p>
<p>
<script type="math/tex; mode=display">
\begin{align*}
r_t &= ρ r_{t-1} + (1 - ρ)g^2 \\
η_t &= \frac{η}{\sqrt{r_t + ϵ}}
\end{align*}
</script>
</p>
<p><strong>References</strong></p>
<ol>
<li>Tieleman, T. and Hinton, G. (2012): Neural Networks for Machine Learning, Lecture 6.5 - rmsprop. Coursera. <a href="http://www.youtube.com/watch?v=O3sxAc4hxZU">http://www.youtube.com/watch?v=O3sxAc4hxZU</a> (formula @5:20)</li>
</ol>
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizers/rmsprop.jl#L18' class='documenter-source'>source</a><br></p>
<p><a id='Nadam-1'></a></p>
<h3 id="nadam">Nadam</h3>
<p><a id='MXNet.mx.Nadam' href='#MXNet.mx.Nadam'>#</a>
<strong><code>MXNet.mx.Nadam</code></strong> &mdash; <em>Type</em>.</p>
<pre><code class="julia">Nadam(; kwargs...)
</code></pre>
<p>Nesterov Adam optimizer: Adam with Nesterov momentum; see [1] and the notes for further description.</p>
<p><strong>Arguments</strong></p>
<ul>
<li><code>η</code>: default <code>0.001</code>, learning rate.</li>
<li><code>β1</code>: default <code>0.99</code>.</li>
<li><code>β2</code>: default <code>0.999</code>.</li>
<li><code>ϵ</code>: default <code>1e-8</code>, small value added for numerical stability.</li>
<li><code>clip</code>: default <code>0</code>, gradient clipping. If positive, will clip the gradient into the range <code>[-clip, clip]</code>.</li>
<li><code>scale</code>: default <code>0</code>, gradient rescaling. If != 0, multiply the gradient with <code>scale</code> before updating. Often chosen to be <code>1.0 / batch_size</code>. If left at the default, high-level APIs like <code>fit!</code> will set it to <code>1.0 / batch_size</code>, since <code>fit!</code> knows the <code>batch_size</code>.</li>
<li><code>λ</code>: default <code>0.00001</code>, weight decay is equivalent to adding a global l2 regularizer for all the parameters.</li>
<li><code>η_sched::AbstractLearningRateScheduler</code>: default <code>nothing</code>, a dynamic learning rate scheduler. If set, will overwrite the <code>η</code> parameter.</li>
<li>
<p><code>μ_sched::NadamScheduler</code>: default <code>NadamScheduler()</code>, of the form:</p>
<p>
<script type="math/tex; mode=display">
\mu_t = β_1 (1 - 0.5 \times 0.96^{t \times 0.004})
</script>
</p>
</li>
</ul>
<p><strong>Notes</strong></p>
<p>Default parameters follow those provided in the paper. It is recommended to leave the parameters of this optimizer at their default values.</p>
<p><strong>References</strong></p>
<ol>
<li><a href="http://cs229.stanford.edu/proj2015/054_report.pdf">Incorporating Nesterov Momentum into Adam</a>.</li>
<li><a href="http://www.cs.toronto.edu/~fritz/absps/momentum.pdf">On the importance of initialization and momentum in deep learning</a>.</li>
</ol>
<p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizers/nadam.jl#L18' class='documenter-source'>source</a><br></p>
</article>
</div>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-nav">
<nav class="md-footer-nav__inner md-grid">
<a href="../initializer/" title="Initializers" class="md-flex md-footer-nav__link md-footer-nav__link--prev" rel="prev">
<div class="md-flex__cell md-flex__cell--shrink">
<i class="md-icon md-icon--arrow-back md-footer-nav__button"></i>
</div>
<div class="md-flex__cell md-flex__cell--stretch md-footer-nav__title">
<span class="md-flex__ellipsis">
<span class="md-footer-nav__direction">
Previous
</span>
Initializers
</span>
</div>
</a>
<a href="../callback/" title="Callbacks in training" class="md-flex md-footer-nav__link md-footer-nav__link--next" rel="next">
<div class="md-flex__cell md-flex__cell--stretch md-footer-nav__title">
<span class="md-flex__ellipsis">
<span class="md-footer-nav__direction">
Next
</span>
Callbacks in training
</span>
</div>
<div class="md-flex__cell md-flex__cell--shrink">
<i class="md-icon md-icon--arrow-forward md-footer-nav__button"></i>
</div>
</a>
</nav>
</div>
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-footer-copyright">
powered by
<a href="https://www.mkdocs.org">MkDocs</a>
and
<a href="https://squidfunk.github.io/mkdocs-material/">
Material for MkDocs</a>
</div>
</div>
</div>
</footer>
</div>
<script src="../../assets/javascripts/application.808e90bb.js"></script>
<script>app.initialize({version:"1.0.4",url:{base:"../.."}})</script>
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML"></script>
<script src="../../assets/mathjaxhelper.js"></script>
</body>
</html>