| |
| |
| |
| |
| <!doctype html> |
| <html lang="en" class="no-js"> |
<head>
  <!-- Encoding declaration must appear early (within the first 1024 bytes). -->
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width,initial-scale=1">
  <!-- Legacy IE document-mode hint; ignored by modern browsers, kept for old IE. -->
  <meta http-equiv="x-ua-compatible" content="ie=edge">

  <!-- Localized UI strings read by the mkdocs-material theme JavaScript. -->
  <meta name="lang:clipboard.copy" content="Copy to clipboard">
  <meta name="lang:clipboard.copied" content="Copied to clipboard">
  <meta name="lang:search.language" content="en">
  <meta name="lang:search.pipeline.stopwords" content="True">
  <meta name="lang:search.pipeline.trimmer" content="True">
  <meta name="lang:search.result.none" content="No matching documents">
  <meta name="lang:search.result.one" content="1 matching document">
  <meta name="lang:search.result.other" content="# matching documents">
  <meta name="lang:search.tokenizer" content="[\s\-]+">

  <!-- rel="icon" is the conforming form; the legacy "shortcut" token is unnecessary. -->
  <link rel="icon" href="../../assets/images/favicon.png">
  <meta name="generator" content="mkdocs-1.0.4, mkdocs-material-4.6.0">

  <title>Optimizers - MXNet.jl</title>

  <link rel="stylesheet" href="../../assets/stylesheets/application.1b62728e.css">

  <!-- Modernizr feature detection; the theme toggles the html "no-js" class. -->
  <script src="../../assets/javascripts/modernizr.268332fc.js"></script>

  <!-- Warm up the connection to the font CDN before the stylesheet requests it. -->
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
  <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,400,400i,700%7CRoboto+Mono&display=fallback">
  <style>body,input{font-family:"Roboto","Helvetica Neue",Helvetica,Arial,sans-serif}code,kbd,pre{font-family:"Roboto Mono","Courier New",Courier,monospace}</style>

  <link rel="stylesheet" href="../../assets/fonts/material-icons.css">
  <link rel="stylesheet" href="../../assets/Documenter.css">
</head>
| |
| <body dir="ltr"> |
| |
| <svg class="md-svg"> |
| <defs> |
| |
| |
| <svg xmlns="http://www.w3.org/2000/svg" width="416" height="448" viewBox="0 0 416 448" id="__github"><path fill="currentColor" d="M160 304q0 10-3.125 20.5t-10.75 19T128 352t-18.125-8.5-10.75-19T96 304t3.125-20.5 10.75-19T128 256t18.125 8.5 10.75 19T160 304zm160 0q0 10-3.125 20.5t-10.75 19T288 352t-18.125-8.5-10.75-19T256 304t3.125-20.5 10.75-19T288 256t18.125 8.5 10.75 19T320 304zm40 0q0-30-17.25-51T296 232q-10.25 0-48.75 5.25Q229.5 240 208 240t-39.25-2.75Q130.75 232 120 232q-29.5 0-46.75 21T56 304q0 22 8 38.375t20.25 25.75 30.5 15 35 7.375 37.25 1.75h42q20.5 0 37.25-1.75t35-7.375 30.5-15 20.25-25.75T360 304zm56-44q0 51.75-15.25 82.75-9.5 19.25-26.375 33.25t-35.25 21.5-42.5 11.875-42.875 5.5T212 416q-19.5 0-35.5-.75t-36.875-3.125-38.125-7.5-34.25-12.875T37 371.5t-21.5-28.75Q0 312 0 260q0-59.25 34-99-6.75-20.5-6.75-42.5 0-29 12.75-54.5 27 0 47.5 9.875t47.25 30.875Q171.5 96 212 96q37 0 70 8 26.25-20.5 46.75-30.25T376 64q12.75 25.5 12.75 54.5 0 21.75-6.75 42 34 40 34 99.5z"/></svg> |
| |
| </defs> |
| </svg> |
| <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off"> |
| <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off"> |
| <label class="md-overlay" data-md-component="overlay" for="__drawer"></label> |
| |
| <a href="#optimizers" tabindex="1" class="md-skip"> |
| Skip to content |
| </a> |
| |
| |
| <header class="md-header" data-md-component="header"> |
| <nav class="md-header-nav md-grid"> |
| <div class="md-flex"> |
| <div class="md-flex__cell md-flex__cell--shrink"> |
| <a href="../.." title="MXNet.jl" class="md-header-nav__button md-logo"> |
| |
| <i class="md-icon"></i> |
| |
| </a> |
| </div> |
| <div class="md-flex__cell md-flex__cell--shrink"> |
| <label class="md-icon md-icon--menu md-header-nav__button" for="__drawer"></label> |
| </div> |
| <div class="md-flex__cell md-flex__cell--stretch"> |
| <div class="md-flex__ellipsis md-header-nav__title" data-md-component="title"> |
| |
| <span class="md-header-nav__topic"> |
| MXNet.jl |
| </span> |
| <span class="md-header-nav__topic"> |
| |
| Optimizers |
| |
| </span> |
| |
| </div> |
| </div> |
| <div class="md-flex__cell md-flex__cell--shrink"> |
| |
| <label class="md-icon md-icon--search md-header-nav__button" for="__search"></label> |
| |
| <div class="md-search" data-md-component="search" role="dialog"> |
| <label class="md-search__overlay" for="__search"></label> |
| <div class="md-search__inner" role="search"> |
| <form class="md-search__form" name="search"> |
| <input type="text" class="md-search__input" name="query" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="query" data-md-state="active"> |
| <label class="md-icon md-search__icon" for="__search"></label> |
| <button type="reset" class="md-icon md-search__icon" data-md-component="reset" tabindex="-1"> |
|  |
| </button> |
| </form> |
| <div class="md-search__output"> |
| <div class="md-search__scrollwrap" data-md-scrollfix> |
| <div class="md-search-result" data-md-component="result"> |
| <div class="md-search-result__meta"> |
| Type to start searching |
| </div> |
| <ol class="md-search-result__list"></ol> |
| </div> |
| </div> |
| </div> |
| </div> |
| </div> |
| |
| </div> |
| |
| <div class="md-flex__cell md-flex__cell--shrink"> |
| <div class="md-header-nav__source"> |
| |
| |
| |
| |
| |
| <a href="https://github.com/apache/mxnet/tree/master/julia#mxnet/" title="Go to repository" class="md-source" data-md-source="github"> |
| |
| <div class="md-source__icon"> |
| <svg viewBox="0 0 24 24" width="24" height="24"> |
| <use xlink:href="#__github" width="24" height="24"></use> |
| </svg> |
| </div> |
| |
| <div class="md-source__repository"> |
| GitHub |
| </div> |
| </a> |
| </div> |
| </div> |
| |
| </div> |
| </nav> |
| </header> |
| |
| <div class="md-container"> |
| |
| |
| |
| |
| <main class="md-main" role="main"> |
| <div class="md-main__inner md-grid" data-md-component="container"> |
| |
| |
| <div class="md-sidebar md-sidebar--primary" data-md-component="navigation"> |
| <div class="md-sidebar__scrollwrap"> |
| <div class="md-sidebar__inner"> |
| <nav class="md-nav md-nav--primary" data-md-level="0"> |
| <label class="md-nav__title md-nav__title--site" for="__drawer"> |
| <a href="../.." title="MXNet.jl" class="md-nav__button md-logo"> |
| |
| <i class="md-icon"></i> |
| |
| </a> |
| MXNet.jl |
| </label> |
| |
| <div class="md-nav__source"> |
| |
| |
| |
| |
| |
| <a href="https://github.com/apache/mxnet/tree/master/julia#mxnet/" title="Go to repository" class="md-source" data-md-source="github"> |
| |
| <div class="md-source__icon"> |
| <svg viewBox="0 0 24 24" width="24" height="24"> |
| <use xlink:href="#__github" width="24" height="24"></use> |
| </svg> |
| </div> |
| |
| <div class="md-source__repository"> |
| GitHub |
| </div> |
| </a> |
| </div> |
| |
| <ul class="md-nav__list" data-md-scrollfix> |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item"> |
| <a href="../.." title="Home" class="md-nav__link"> |
| Home |
| </a> |
| </li> |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item md-nav__item--nested"> |
| |
| <input class="md-toggle md-nav__toggle" data-md-toggle="nav-2" type="checkbox" id="nav-2"> |
| |
| <label class="md-nav__link" for="nav-2"> |
| Tutorial |
| </label> |
| <nav class="md-nav" data-md-component="collapsible" data-md-level="1"> |
| <label class="md-nav__title" for="nav-2"> |
| Tutorial |
| </label> |
| <ul class="md-nav__list" data-md-scrollfix> |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item"> |
| <a href="../../tutorial/mnist/" title="Digit Recognition on MNIST" class="md-nav__link"> |
| Digit Recognition on MNIST |
| </a> |
| </li> |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item"> |
| <a href="../../tutorial/char-lstm/" title="Generating Random Sentence with LSTM RNN" class="md-nav__link"> |
| Generating Random Sentence with LSTM RNN |
| </a> |
| </li> |
| |
| |
| </ul> |
| </nav> |
| </li> |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item md-nav__item--nested"> |
| |
| <input class="md-toggle md-nav__toggle" data-md-toggle="nav-3" type="checkbox" id="nav-3"> |
| |
| <label class="md-nav__link" for="nav-3"> |
| User Guide |
| </label> |
| <nav class="md-nav" data-md-component="collapsible" data-md-level="1"> |
| <label class="md-nav__title" for="nav-3"> |
| User Guide |
| </label> |
| <ul class="md-nav__list" data-md-scrollfix> |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item"> |
| <a href="../../user-guide/install/" title="Installation Guide" class="md-nav__link"> |
| Installation Guide |
| </a> |
| </li> |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item"> |
| <a href="../../user-guide/overview/" title="Overview" class="md-nav__link"> |
| Overview |
| </a> |
| </li> |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item"> |
| <a href="../../user-guide/faq/" title="FAQ" class="md-nav__link"> |
| FAQ |
| </a> |
| </li> |
| |
| |
| </ul> |
| </nav> |
| </li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item md-nav__item--active md-nav__item--nested"> |
| |
| <input class="md-toggle md-nav__toggle" data-md-toggle="nav-4" type="checkbox" id="nav-4" checked> |
| |
| <label class="md-nav__link" for="nav-4"> |
| API Documentation |
| </label> |
| <nav class="md-nav" data-md-component="collapsible" data-md-level="1"> |
| <label class="md-nav__title" for="nav-4"> |
| API Documentation |
| </label> |
| <ul class="md-nav__list" data-md-scrollfix> |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item"> |
| <a href="../context/" title="Context" class="md-nav__link"> |
| Context |
| </a> |
| </li> |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item"> |
| <a href="../model/" title="Models" class="md-nav__link"> |
| Models |
| </a> |
| </li> |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item"> |
| <a href="../initializer/" title="Initializers" class="md-nav__link"> |
| Initializers |
| </a> |
| </li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item md-nav__item--active"> |
| |
| <input class="md-toggle md-nav__toggle" data-md-toggle="toc" type="checkbox" id="__toc"> |
| |
| |
| |
| |
| <label class="md-nav__link md-nav__link--active" for="__toc"> |
| Optimizers |
| </label> |
| |
| <a href="./" title="Optimizers" class="md-nav__link md-nav__link--active"> |
| Optimizers |
| </a> |
| |
| |
| <nav class="md-nav md-nav--secondary"> |
| |
| |
| |
| |
| |
| <label class="md-nav__title" for="__toc">Table of contents</label> |
| <ul class="md-nav__list" data-md-scrollfix> |
| |
| <li class="md-nav__item"> |
| <a href="#built-in-optimizers" class="md-nav__link"> |
| Built-in optimizers |
| </a> |
| |
| <nav class="md-nav"> |
| <ul class="md-nav__list"> |
| |
| <li class="md-nav__item"> |
| <a href="#stochastic-gradient-descent" class="md-nav__link"> |
| Stochastic Gradient Descent |
| </a> |
| |
| </li> |
| |
| <li class="md-nav__item"> |
| <a href="#adam" class="md-nav__link"> |
| ADAM |
| </a> |
| |
| </li> |
| |
| <li class="md-nav__item"> |
| <a href="#adagrad" class="md-nav__link"> |
| AdaGrad |
| </a> |
| |
| </li> |
| |
| <li class="md-nav__item"> |
| <a href="#adadelta" class="md-nav__link"> |
| AdaDelta |
| </a> |
| |
| </li> |
| |
| <li class="md-nav__item"> |
| <a href="#adamax" class="md-nav__link"> |
| AdaMax |
| </a> |
| |
| </li> |
| |
| <li class="md-nav__item"> |
| <a href="#rmsprop" class="md-nav__link"> |
| RMSProp |
| </a> |
| |
| </li> |
| |
| <li class="md-nav__item"> |
| <a href="#nadam" class="md-nav__link"> |
| Nadam |
| </a> |
| |
| </li> |
| |
| </ul> |
| </nav> |
| |
| </li> |
| |
| |
| |
| |
| |
| </ul> |
| |
| </nav> |
| |
| </li> |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item"> |
| <a href="../callback/" title="Callbacks in training" class="md-nav__link"> |
| Callbacks in training |
| </a> |
| </li> |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item"> |
| <a href="../metric/" title="Evaluation Metrics" class="md-nav__link"> |
| Evaluation Metrics |
| </a> |
| </li> |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item"> |
| <a href="../io/" title="Data Providers" class="md-nav__link"> |
| Data Providers |
| </a> |
| </li> |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item"> |
| <a href="../ndarray/" title="NDArray API" class="md-nav__link"> |
| NDArray API |
| </a> |
| </li> |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item"> |
| <a href="../symbolic-node/" title="Symbolic API" class="md-nav__link"> |
| Symbolic API |
| </a> |
| </li> |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item"> |
| <a href="../nn-factory/" title="Neural Networks Factory" class="md-nav__link"> |
| Neural Networks Factory |
| </a> |
| </li> |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item"> |
| <a href="../executor/" title="Executor" class="md-nav__link"> |
| Executor |
| </a> |
| </li> |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item"> |
| <a href="../kvstore/" title="Key-Value Store" class="md-nav__link"> |
| Key-Value Store |
| </a> |
| </li> |
| |
| |
| |
| |
| |
| |
| |
| <li class="md-nav__item"> |
| <a href="../visualize/" title="Network Visualization" class="md-nav__link"> |
| Network Visualization |
| </a> |
| </li> |
| |
| |
| </ul> |
| </nav> |
| </li> |
| |
| |
| </ul> |
| </nav> |
| </div> |
| </div> |
| </div> |
| |
| |
| <div class="md-sidebar md-sidebar--secondary" data-md-component="toc"> |
| <div class="md-sidebar__scrollwrap"> |
| <div class="md-sidebar__inner"> |
| |
| <nav class="md-nav md-nav--secondary"> |
| |
| |
| |
| |
| |
| <label class="md-nav__title" for="__toc">Table of contents</label> |
| <ul class="md-nav__list" data-md-scrollfix> |
| |
| <li class="md-nav__item"> |
| <a href="#built-in-optimizers" class="md-nav__link"> |
| Built-in optimizers |
| </a> |
| |
| <nav class="md-nav"> |
| <ul class="md-nav__list"> |
| |
| <li class="md-nav__item"> |
| <a href="#stochastic-gradient-descent" class="md-nav__link"> |
| Stochastic Gradient Descent |
| </a> |
| |
| </li> |
| |
| <li class="md-nav__item"> |
| <a href="#adam" class="md-nav__link"> |
| ADAM |
| </a> |
| |
| </li> |
| |
| <li class="md-nav__item"> |
| <a href="#adagrad" class="md-nav__link"> |
| AdaGrad |
| </a> |
| |
| </li> |
| |
| <li class="md-nav__item"> |
| <a href="#adadelta" class="md-nav__link"> |
| AdaDelta |
| </a> |
| |
| </li> |
| |
| <li class="md-nav__item"> |
| <a href="#adamax" class="md-nav__link"> |
| AdaMax |
| </a> |
| |
| </li> |
| |
| <li class="md-nav__item"> |
| <a href="#rmsprop" class="md-nav__link"> |
| RMSProp |
| </a> |
| |
| </li> |
| |
| <li class="md-nav__item"> |
| <a href="#nadam" class="md-nav__link"> |
| Nadam |
| </a> |
| |
| </li> |
| |
| </ul> |
| </nav> |
| |
| </li> |
| |
| |
| |
| |
| |
| </ul> |
| |
| </nav> |
| </div> |
| </div> |
| </div> |
| |
| |
| <div class="md-content"> |
| <article class="md-content__inner md-typeset"> |
| |
| |
| <a href="https://github.com/apache/mxnet/tree/master/edit/master/docs/api/optimizer.md" title="Edit this page" class="md-icon md-content__icon"></a> |
| |
| |
<!-- Licensed to the Apache Software Foundation (ASF) under one --> <!-- or more contributor license agreements. See the NOTICE file --> <!-- distributed with this work for additional information --> <!-- regarding copyright ownership. The ASF licenses this file --> <!-- to you under the Apache License, Version 2.0 (the --> <!-- "License"); you may not use this file except in compliance --> <!-- with the License. You may obtain a copy of the License at --> <!-- --> <!-- http://www.apache.org/licenses/LICENSE-2.0 --> <!-- --> <!-- Unless required by applicable law or agreed to in writing, --> <!-- software distributed under the License is distributed on an --> <!-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --> <!-- KIND, either express or implied. See the License for the --> <!-- specific language governing permissions and limitations --> <!-- under the License. -->
| |
| <p><a id='Optimizers-1'></a></p> |
| <h1 id="optimizers">Optimizers</h1> |
<p>Say you have the parameter <code>W</code> initialized for your model and its gradient stored as <code>∇</code> (perhaps from AutoGrad APIs). Here is a minimal snippet showing how to update your parameter <code>W</code> with <code>SGD</code>.</p>
| <pre><code class="julia-repl">julia> using MXNet |
| |
| julia> opt = SGD(η = 10) |
| SGD(10, 0.0, 0, 0, 0.0001, MXNet.mx.LearningRate.Fixed(10.0), MXNet.mx.Momentum.Null()) |
| |
| julia> decend! = getupdater(opt) |
| (::getfield(MXNet.mx, Symbol("#updater#9272")){SGD,Dict{Int64,Any}}) (generic function with 1 method) |
| |
| julia> W = NDArray(Float32[1, 2, 3, 4]); |
| |
| julia> ∇ = NDArray(Float32[.1, .2, .3, .4]); |
| |
| julia> decend!(1, ∇, W) |
| 4-element NDArray{Float32,1} @ cpu0: |
| -0.0010000467f0 |
| -0.0020000935f0 |
| -0.003000021f0 |
| -0.004000187f0 |
| </code></pre> |
| |
| <p><a id='MXNet.mx.AbstractOptimizer' href='#MXNet.mx.AbstractOptimizer'>#</a> |
| <strong><code>MXNet.mx.AbstractOptimizer</code></strong> — <em>Type</em>.</p> |
| <pre><code class="julia">AbstractOptimizer |
| </code></pre> |
| |
| <p>Base type for all optimizers.</p> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L22-L26' class='documenter-source'>source</a><br></p> |
| <p><a id='MXNet.mx.getupdater-Tuple{AbstractOptimizer}' href='#MXNet.mx.getupdater-Tuple{AbstractOptimizer}'>#</a> |
| <strong><code>MXNet.mx.getupdater</code></strong> — <em>Method</em>.</p> |
| <pre><code class="julia">getupdater(optimizer) |
| </code></pre> |
| |
| <p>A utility function to create an updater function of <code>KVStore</code>, that uses its closure to store all the states needed for each weights.</p> |
<p>The returned function has the following signature:</p>
| <pre><code class="julia">decend!(index::Int, ∇::NDArray, x::NDArray) |
| </code></pre> |
| |
| <p>If the optimizer is stateful and need access/store states during updating, <code>index</code> will be the key to access/store states.</p> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L252-L266' class='documenter-source'>source</a><br></p> |
| <p><a id='MXNet.mx.normgrad!-Tuple{AbstractOptimizer,NDArray,NDArray}' href='#MXNet.mx.normgrad!-Tuple{AbstractOptimizer,NDArray,NDArray}'>#</a> |
| <strong><code>MXNet.mx.normgrad!</code></strong> — <em>Method</em>.</p> |
| <pre><code class="julia">normgrad(optimizer, W, ∇) |
| </code></pre> |
| |
| <p>Get the properly normalized gradient (re-scaled and clipped if necessary).</p> |
| <ul> |
| <li><code>optimizer</code>: the optimizer, should contain the field <code>scale</code>, <code>clip</code> and <code>λ</code>.</li> |
| <li><code>W::NDArray</code>: the trainable weights.</li> |
| <li><code>∇::NDArray</code>: the original gradient of the weights.</li> |
| </ul> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L278-L287' class='documenter-source'>source</a><br></p> |
| <p><a id='MXNet.mx.AbstractLearningRateScheduler' href='#MXNet.mx.AbstractLearningRateScheduler'>#</a> |
| <strong><code>MXNet.mx.AbstractLearningRateScheduler</code></strong> — <em>Type</em>.</p> |
| <pre><code class="julia">AbstractLearningRateScheduler |
| </code></pre> |
| |
| <p>Base type for all learning rate scheduler.</p> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L29-L33' class='documenter-source'>source</a><br></p> |
| <p><a id='MXNet.mx.AbstractMomentumScheduler' href='#MXNet.mx.AbstractMomentumScheduler'>#</a> |
| <strong><code>MXNet.mx.AbstractMomentumScheduler</code></strong> — <em>Type</em>.</p> |
| <pre><code class="julia">AbstractMomentumScheduler |
| </code></pre> |
| |
| <p>Base type for all momentum scheduler.</p> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L36-L40' class='documenter-source'>source</a><br></p> |
| <p><a id='MXNet.mx.OptimizationState' href='#MXNet.mx.OptimizationState'>#</a> |
| <strong><code>MXNet.mx.OptimizationState</code></strong> — <em>Type</em>.</p> |
| <pre><code class="julia">OptimizationState |
| </code></pre> |
| |
| <p><strong>Attributes</strong></p> |
| <ul> |
| <li><code>batch_size</code>: The size of the mini-batch used in stochastic training.</li> |
<li><code>curr_epoch</code>: The current epoch count. Epoch 0 means no training yet, during the first pass through the data, the epoch will be 1; during the second pass, the epoch count will be 2, and so on.</li>
| <li><code>curr_batch</code>: The current mini-batch count. The batch count is reset during every epoch. The batch count 0 means the beginning of each epoch, with no mini-batch seen yet. During the first mini-batch, the mini-batch count will be 1.</li> |
<li><code>curr_iter</code>: The current iteration count. One iteration corresponds to one mini-batch, but unlike the mini-batch count, the iteration count does <strong>not</strong> reset in each epoch. So it tracks the <em>total</em> number of mini-batches seen so far.</li>
| </ul> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L43-L60' class='documenter-source'>source</a><br></p> |
| <p><a id='MXNet.mx.LearningRate.Exp' href='#MXNet.mx.LearningRate.Exp'>#</a> |
| <strong><code>MXNet.mx.LearningRate.Exp</code></strong> — <em>Type</em>.</p> |
| <pre><code class="julia">LearningRate.Exp(η₀; γ = 0.9) |
| </code></pre> |
| |
| <p> |
| <script type="math/tex; mode=display"> |
| \eta_t = \eta_0\gamma^t |
| </script> |
| </p> |
| <p>Where <code>t</code> is the epoch count, or the iteration count.</p> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L105' class='documenter-source'>source</a><br></p> |
| <p><a id='MXNet.mx.LearningRate.Fixed' href='#MXNet.mx.LearningRate.Fixed'>#</a> |
| <strong><code>MXNet.mx.LearningRate.Fixed</code></strong> — <em>Type</em>.</p> |
| <pre><code class="julia">LearningRate.Fixed(η) |
| </code></pre> |
| |
| <p>Fixed learning rate scheduler always return the same learning rate.</p> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L94-L98' class='documenter-source'>source</a><br></p> |
| <p><a id='MXNet.mx.LearningRate.Inv' href='#MXNet.mx.LearningRate.Inv'>#</a> |
| <strong><code>MXNet.mx.LearningRate.Inv</code></strong> — <em>Type</em>.</p> |
| <pre><code class="julia">LearningRate.Inv(η₀; γ = 0.9, p = 0.5) |
| </code></pre> |
| |
| <p> |
| <script type="math/tex; mode=display"> |
| \eta_t = \eta_0 (1 + \gamma t)^{-p} |
| </script> |
| </p> |
| <p>Where <code>t</code> is the epoch count, or the iteration count.</p> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L127' class='documenter-source'>source</a><br></p> |
| <p><a id='Base.get-Tuple{MXNet.mx.AbstractLearningRateScheduler}' href='#Base.get-Tuple{MXNet.mx.AbstractLearningRateScheduler}'>#</a> |
| <strong><code>Base.get</code></strong> — <em>Method</em>.</p> |
| <pre><code class="julia">get(sched::AbstractLearningRateScheduler) |
| </code></pre> |
| |
| <p>Returns the current learning rate.</p> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L87-L91' class='documenter-source'>source</a><br></p> |
| <p><a id='MXNet.mx.Momentum.Fixed' href='#MXNet.mx.Momentum.Fixed'>#</a> |
| <strong><code>MXNet.mx.Momentum.Fixed</code></strong> — <em>Type</em>.</p> |
| <pre><code class="julia">Momentum.Fixed |
| </code></pre> |
| |
| <p>Fixed momentum scheduler always returns the same value.</p> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L190-L194' class='documenter-source'>source</a><br></p> |
| <p><a id='MXNet.mx.Momentum.NadamScheduler' href='#MXNet.mx.Momentum.NadamScheduler'>#</a> |
| <strong><code>MXNet.mx.Momentum.NadamScheduler</code></strong> — <em>Type</em>.</p> |
| <pre><code class="julia">NadamScheduler(; μ = 0.99, δ = 0.004, γ = 0.5, α = 0.96) |
| </code></pre> |
| |
| <p>Nesterov-accelerated adaptive momentum scheduler.</p> |
| <p>Description in <a href="http://cs229.stanford.edu/proj2015/054_report.pdf">Incorporating Nesterov Momentum into Adam</a>.</p> |
| <p> |
| <script type="math/tex; mode=display"> |
| \mu_t = \mu_0 * (1 - \gamma * \alpha^{t * \delta}) |
| </script> |
| </p> |
| <p>Where</p> |
| <ul> |
| <li><code>t</code>: iteration count</li> |
| <li><code>μ</code>: default <code>0.99</code>, μ₀</li> |
| <li><code>δ</code>: default <code>0.004</code> is scheduler decay.</li> |
| <li><code>γ</code>: default <code>0.5</code></li> |
| <li><code>α</code>: default <code>0.96</code></li> |
| </ul> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L201' class='documenter-source'>source</a><br></p> |
| <p><a id='MXNet.mx.Momentum.Null' href='#MXNet.mx.Momentum.Null'>#</a> |
| <strong><code>MXNet.mx.Momentum.Null</code></strong> — <em>Type</em>.</p> |
| <pre><code class="julia">Momentum.Null |
| </code></pre> |
| |
| <p>The null momentum scheduler always returns 0 for momentum. It is also used to explicitly indicate momentum should not be used.</p> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L179-L184' class='documenter-source'>source</a><br></p> |
| <p><a id='Base.get-Tuple{MXNet.mx.Momentum.NadamScheduler,Any}' href='#Base.get-Tuple{MXNet.mx.Momentum.NadamScheduler,Any}'>#</a> |
| <strong><code>Base.get</code></strong> — <em>Method</em>.</p> |
| <pre><code class="julia">get(n::NadamScheduler, t) |
| </code></pre> |
| |
| <p>Where <code>t</code> is the iteration count.</p> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizer.jl#L235-L239' class='documenter-source'>source</a><br></p> |
| <p><a id='Built-in-optimizers-1'></a></p> |
| <h2 id="built-in-optimizers">Built-in optimizers</h2> |
| <p><a id='Stochastic-Gradient-Descent-1'></a></p> |
| <h3 id="stochastic-gradient-descent">Stochastic Gradient Descent</h3> |
| <p><a id='MXNet.mx.SGD' href='#MXNet.mx.SGD'>#</a> |
| <strong><code>MXNet.mx.SGD</code></strong> — <em>Type</em>.</p> |
| <pre><code class="julia">SGD(; kwargs...) |
| </code></pre> |
| |
| <p>Stochastic gradient descent optimizer.</p> |
| <p>Vanilla SGD:</p> |
| <p> |
| <script type="math/tex; mode=display"> |
| \theta \leftarrow \theta - \eta \nabla |
| </script> |
| </p> |
<p>SGD with momentum:</p>
| <p> |
| <script type="math/tex; mode=display"> |
| \begin{align*} |
| \nu & \leftarrow \mu \nu_{t-1} - \eta \nabla \\ |
| \theta & \leftarrow \theta + \nu_t |
| \end{align*} |
| </script> |
| </p> |
| <p><strong>Arguments</strong></p> |
| <ul> |
| <li><code>η</code>: default <code>0.01</code>, learning rate.</li> |
| <li><code>μ</code>: default <code>0</code>, the momentum, usually set to <code>0.9</code> in this implementation.</li> |
| <li><code>λ</code>: default <code>0.0001</code>, weight decay is equivalent to adding a global l2 regularizer to the parameters.</li> |
| <li><code>clip</code>: default <code>0</code>, gradient clipping. If positive, will clip the gradient into the bounded range <code>[-clip, clip]</code>.</li> |
| <li><code>scale</code>: default <code>0</code>, gradient rescaling. If != 0, multiply the gradient with <code>scale</code> before updating. Often choose to be <code>1.0 / batch_size</code>. If leave it default, high-level API like <code>fit!</code> will set it to <code>1.0 / batch_size</code>, since <code>fit!</code> knows the <code>batch_size</code>.</li> |
| <li><code>μ_sched::AbstractMomentumScheduler</code>: default <code>Momentum.Null()</code>, a dynamic momentum scheduler. If set, will overwrite the <code>momentum</code> parameter.</li> |
| <li><code>η_sched::AbstractLearningRateScheduler</code>: default <code>LearningRate.Fixed(η)</code>, a dynamic learning rate scheduler. If set, will overwrite the <code>η</code> parameter.</li> |
| </ul> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizers/sgd.jl#L18' class='documenter-source'>source</a><br></p> |
| <p><a id='ADAM-1'></a></p> |
| <h3 id="adam">ADAM</h3> |
| <p><a id='MXNet.mx.ADAM' href='#MXNet.mx.ADAM'>#</a> |
| <strong><code>MXNet.mx.ADAM</code></strong> — <em>Type</em>.</p> |
| <pre><code class="julia"> ADAM |
| </code></pre> |
| |
| <p>The solver described in Diederik Kingma, Jimmy Ba: <em>Adam: A Method for Stochastic Optimization</em>. arXiv:1412.6980 [cs.LG].</p> |
| <pre><code>ADAM(; kwargs...) |
| </code></pre> |
| |
| <p><strong>Arguments</strong></p> |
| <ul> |
| <li><code>η</code>: default <code>0.001</code>, learning rate.</li> |
| <li><code>β1</code>: default <code>0.9</code>.</li> |
| <li><code>β2</code>: default <code>0.999</code>.</li> |
| <li><code>ϵ</code>: default <code>1e-8</code>.</li> |
| <li><code>clip</code>: default <code>0</code>, gradient clipping. If positive, will clip the gradient into the range <code>[-clip, clip]</code>.</li> |
| <li><code>scale</code>: default <code>0</code>, gradient rescaling. If != 0, multiply the gradient with <code>scale</code> before updating. Often choose to be <code>1.0 / batch_size</code>. If leave it default, high-level API like <code>fit!</code> will set it to <code>1.0 / batch_size</code>, since <code>fit!</code> knows the <code>batch_size</code>.</li> |
| <li><code>λ</code>: default <code>0.00001</code>, weight decay is equivalent to adding a global l2 regularizer for all the parameters.</li> |
| <li><code>η_sched::AbstractLearningRateScheduler</code>: default <code>LearningRate.Fixed(η)</code>, a dynamic learning rate scheduler. If set, will overwrite the <code>η</code> parameter.</li> |
| </ul> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizers/adam.jl#L18-L42' class='documenter-source'>source</a><br></p> |
| <p><a id='AdaGrad-1'></a></p> |
| <h3 id="adagrad">AdaGrad</h3> |
| <p><a id='MXNet.mx.AdaGrad' href='#MXNet.mx.AdaGrad'>#</a> |
| <strong><code>MXNet.mx.AdaGrad</code></strong> — <em>Type</em>.</p> |
| <pre><code class="julia">AdaGrad(; kwargs...) |
| </code></pre> |
| |
| <p>Scale learning rates by dividing with the square root of accumulated squared gradients. See [1] for further description.</p> |
| <p><strong>Arguments</strong></p> |
| <ul> |
| <li><code>η</code>: default <code>0.1</code>, learning rate.</li> |
| <li><code>ϵ</code>: default <code>1e-6</code>, small value added for numerical stability.</li> |
| <li><code>clip</code>: default <code>0</code>, gradient clipping. If positive, will clip the gradient into the range <code>[-clip, clip]</code>.</li> |
| <li><code>scale</code>: default <code>0</code>, gradient rescaling. If != 0, multiply the gradient with <code>scale</code> before updating. Often choose to be <code>1.0 / batch_size</code>. If leave it default, high-level API like <code>fit!</code> will set it to <code>1.0 / batch_size</code>, since <code>fit!</code> knows the <code>batch_size</code>.</li> |
| <li><code>λ</code>: default <code>0.00001</code>, weight decay is equivalent to adding a global l2 regularizer for all the parameters.</li> |
| </ul> |
| <p><strong>Notes</strong></p> |
| <p>Using step size <code>η</code> AdaGrad calculates the learning rate for feature <code>i</code> at time step t as:</p> |
| <p> |
| <script type="math/tex; mode=display"> |
| η_{t,i} = \frac{lr}{\sqrt{\sum^t_{t^\prime} g^2_{t^\prime,i} + ϵ}} g_{t,i} |
| </script> |
| </p> |
| <p>as such the learning rate is monotonically decreasing. Epsilon is not included in the typical formula, see [2].</p> |
| <p><strong>References</strong></p> |
| <ol> |
| <li>Duchi, J., Hazan, E., &amp; Singer, Y. (2011): Adaptive subgradient methods for online learning and stochastic optimization. JMLR, 12:2121-2159.</li> |
| <li>Chris Dyer: Notes on AdaGrad. <a href="http://www.ark.cs.cmu.edu/cdyer/adagrad.pdf">http://www.ark.cs.cmu.edu/cdyer/adagrad.pdf</a></li> |
| </ol> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizers/adagrad.jl#L18' class='documenter-source'>source</a><br></p> |
| <p><a id='AdaDelta-1'></a></p> |
| <h3 id="adadelta">AdaDelta</h3> |
| <p><a id='MXNet.mx.AdaDelta' href='#MXNet.mx.AdaDelta'>#</a> |
| <strong><code>MXNet.mx.AdaDelta</code></strong> — <em>Type</em>.</p> |
| <pre><code class="julia">AdaDelta(; kwargs...) |
| </code></pre> |
| |
| <p>Scale learning rates by the ratio of accumulated gradients to accumulated updates, see [1] and notes for further description.</p> |
| <p><strong>Attributes</strong></p> |
| <ul> |
| <li><code>η</code>: default <code>1.0</code>, learning rate.</li> |
| <li><code>ρ</code>: default <code>0.95</code>, squared gradient moving average decay factor.</li> |
| <li><code>ϵ</code>: default <code>1e-6</code>, small value added for numerical stability.</li> |
| <li><code>clip</code>: default <code>0</code>, gradient clipping. If positive, will clip the gradient into the range <code>[-clip, clip]</code>.</li> |
| <li><code>scale</code>: default <code>0</code>, gradient rescaling. If != 0, multiply the gradient with <code>scale</code> before updating. Often chosen to be <code>1.0 / batch_size</code>. If left at the default, a high-level API like <code>fit!</code> will set it to <code>1.0 / batch_size</code>, since <code>fit!</code> knows the <code>batch_size</code>.</li> |
| <li><code>λ</code>: default <code>0.00001</code>, weight decay is equivalent to adding a global l2 regularizer for all the parameters.</li> |
| </ul> |
| <p><strong>Notes</strong></p> |
| <p><code>ρ</code> should be between 0 and 1. A value of <code>ρ</code> close to 1 will decay the moving average slowly and a value close to 0 will decay the moving average fast.</p> |
| <p><code>ρ = 0.95</code> and <code>ϵ = 1e-6</code> are suggested in the paper and reported to work for multiple datasets (MNIST, speech). In the paper, no learning rate is considered (so <code>η = 1.0</code>). Probably best to keep it at this value.</p> |
| <p><code>ϵ</code> is important for the very first update (so the numerator does not become 0).</p> |
| <p>Using the step size <code>η</code> and a decay factor <code>ρ</code> the learning rate is calculated as:</p> |
| <p> |
| <script type="math/tex; mode=display"> |
| \begin{align*} |
| r_t &= ρ r_{t-1} + (1 - ρ) g^2 \\ |
| η_t &= η \frac{\sqrt{s_{t-1} + ϵ}} {\sqrt{r_t + ϵ}} \\ |
| s_t &= ρ s_{t-1} + (1 - ρ) (η_t \times g)^2 |
| \end{align*} |
| </script> |
| </p> |
| <p><strong>References</strong></p> |
| <ol> |
| <li>Zeiler, M. D. (2012): ADADELTA: An Adaptive Learning Rate Method. arXiv Preprint arXiv:1212.5701.</li> |
| </ol> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizers/adadelta.jl#L18' class='documenter-source'>source</a><br></p> |
| <p><a id='AdaMax-1'></a></p> |
| <h3 id="adamax">AdaMax</h3> |
| <p><a id='MXNet.mx.AdaMax' href='#MXNet.mx.AdaMax'>#</a> |
| <strong><code>MXNet.mx.AdaMax</code></strong> — <em>Type</em>.</p> |
| <pre><code class="julia">AdaMax(; kwargs...) |
| </code></pre> |
| |
| <p>This is a variant of the Adam algorithm based on the infinity norm. See [1] for further description.</p> |
| <p><strong>Arguments</strong></p> |
| <ul> |
| <li><code>η</code>: default <code>0.002</code>, learning rate.</li> |
| <li><code>β1</code>: default <code>0.9</code>, exponential decay rate for the first moment estimates.</li> |
| <li><code>β2</code>: default <code>0.999</code>, exponential decay rate for the weighted infinity norm estimates.</li> |
| <li><code>ϵ</code>: default <code>1e-8</code>, small value added for numerical stability.</li> |
| <li><code>clip</code>: default <code>0</code>, gradient clipping. If positive, will clip the gradient into the range <code>[-clip, clip]</code>.</li> |
| <li><code>scale</code>: default <code>0</code>, gradient rescaling. If != 0, multiply the gradient with <code>scale</code> before updating. Often chosen to be <code>1.0 / batch_size</code>. If left at the default, a high-level API like <code>fit!</code> will set it to <code>1.0 / batch_size</code>, since <code>fit!</code> knows the <code>batch_size</code>.</li> |
| <li><code>λ</code>: default <code>0.00001</code>, weight decay is equivalent to adding a global l2 regularizer for all the parameters.</li> |
| </ul> |
| <p><strong>References</strong></p> |
| <ol> |
| <li>Kingma, Diederik, and Jimmy Ba (2014): Adam: A Method for Stochastic Optimization. Section 7. <a href="http://arxiv.org/abs/1412.6980">http://arxiv.org/abs/1412.6980</a>.</li> |
| </ol> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizers/adamax.jl#L18-L45' class='documenter-source'>source</a><br></p> |
| <p><a id='RMSProp-1'></a></p> |
| <h3 id="rmsprop">RMSProp</h3> |
| <p><a id='MXNet.mx.RMSProp' href='#MXNet.mx.RMSProp'>#</a> |
| <strong><code>MXNet.mx.RMSProp</code></strong> — <em>Type</em>.</p> |
| <pre><code class="julia">RMSProp(; kwargs...) |
| </code></pre> |
| |
| <p>Scale learning rates by dividing with the moving average of the root mean squared (RMS) gradients. See [1] for further description.</p> |
| <p><strong>Arguments</strong></p> |
| <ul> |
| <li><code>η</code>: default <code>0.1</code>, learning rate.</li> |
| <li><code>ρ</code>: default <code>0.9</code>, gradient moving average decay factor.</li> |
| <li><code>ϵ</code>: default <code>1e-8</code>, small value added for numerical stability.</li> |
| <li><code>clip</code>: default <code>0</code>, gradient clipping. If positive, will clip the gradient into the range <code>[-clip, clip]</code>.</li> |
| <li><code>scale</code>: default <code>0</code>, gradient rescaling. If != 0, multiply the gradient with <code>scale</code> before updating. Often chosen to be <code>1.0 / batch_size</code>. If left at the default, a high-level API like <code>fit!</code> will set it to <code>1.0 / batch_size</code>, since <code>fit!</code> knows the <code>batch_size</code>.</li> |
| <li><code>λ</code>: default <code>0.00001</code>, weight decay is equivalent to adding a global l2 regularizer for all the parameters.</li> |
| </ul> |
| <p><strong>Notes</strong></p> |
| <p><code>ρ</code> should be between 0 and 1. A value of <code>ρ</code> close to 1 will decay the moving average slowly and a value close to 0 will decay the moving average fast.</p> |
| <p>Using the step size <code>η</code> and a decay factor <code>ρ</code>, the learning rate <code>ηₜ</code> is calculated as:</p> |
| <p> |
| <script type="math/tex; mode=display"> |
| \begin{align*} |
| r_t &= ρ r_{t-1} + (1 - ρ)g^2 \\ |
| η_t &= \frac{η}{\sqrt{r_t + ϵ}} |
| \end{align*} |
| </script> |
| </p> |
| <p><strong>References</strong></p> |
| <ol> |
| <li>Tieleman, T. and Hinton, G. (2012): Neural Networks for Machine Learning, Lecture 6.5 - rmsprop. Coursera. <a href="http://www.youtube.com/watch?v=O3sxAc4hxZU">http://www.youtube.com/watch?v=O3sxAc4hxZU</a> (formula @5:20)</li> |
| </ol> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizers/rmsprop.jl#L18' class='documenter-source'>source</a><br></p> |
| <p><a id='Nadam-1'></a></p> |
| <h3 id="nadam">Nadam</h3> |
| <p><a id='MXNet.mx.Nadam' href='#MXNet.mx.Nadam'>#</a> |
| <strong><code>MXNet.mx.Nadam</code></strong> — <em>Type</em>.</p> |
| <pre><code class="julia">Nadam(; kwargs...) |
| </code></pre> |
| |
| <p>Nesterov Adam optimizer: Adam RMSprop with Nesterov momentum, see [1] and notes for further description.</p> |
| <p><strong>Arguments</strong></p> |
| <ul> |
| <li><code>η</code>: default <code>0.001</code>, learning rate.</li> |
| <li><code>β1</code>: default <code>0.99</code>.</li> |
| <li><code>β2</code>: default <code>0.999</code>.</li> |
| <li><code>ϵ</code>: default <code>1e-8</code>, small value added for numerical stability.</li> |
| <li><code>clip</code>: default <code>0</code>, gradient clipping. If positive, will clip the gradient into the range <code>[-clip, clip]</code>.</li> |
| <li><code>scale</code>: default <code>0</code>, gradient rescaling. If != 0, multiply the gradient with <code>scale</code> before updating. Often chosen to be <code>1.0 / batch_size</code>. If left at the default, a high-level API like <code>fit!</code> will set it to <code>1.0 / batch_size</code>, since <code>fit!</code> knows the <code>batch_size</code>.</li> |
| <li><code>λ</code>: default <code>0.00001</code>, weight decay is equivalent to adding a global l2 regularizer for all the parameters.</li> |
| <li><code>η_sched::AbstractLearningRateScheduler</code>: default <code>nothing</code>, a dynamic learning rate scheduler. If set, will overwrite the <code>η</code> parameter.</li> |
| <li> |
| <p><code>μ_sched::NadamScheduler</code>: default <code>NadamScheduler()</code> of the form.</p> |
| <p> |
| <script type="math/tex; mode=display"> |
| \mu_t = β_1 (1 - 0.5 \times 0.96^{t \times 0.004}) |
| </script> |
| </p> |
| </li> |
| </ul> |
| <p><strong>Notes</strong></p> |
| <p>Default parameters follow those provided in the paper. It is recommended to leave the parameters of this optimizer at their default values.</p> |
| <p><strong>References</strong></p> |
| <ol> |
| <li><a href="http://cs229.stanford.edu/proj2015/054_report.pdf">Incorporating Nesterov Momentum into Adam</a>.</li> |
| <li><a href="http://www.cs.toronto.edu/~fritz/absps/momentum.pdf">On the importance of initialization and momentum in deep learning</a>.</li> |
| </ol> |
| <p><a target='_blank' href='https://github.com/apache/mxnet/blob/26a5ad1f39784a60d1564f6f740e5c7bd971cd65/julia/src/optimizers/nadam.jl#L18' class='documenter-source'>source</a><br></p> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </article> |
| </div> |
| </div> |
| </main> |
| |
| |
| <footer class="md-footer"> |
| |
| <div class="md-footer-nav"> |
| <nav class="md-footer-nav__inner md-grid"> |
| |
| <a href="../initializer/" title="Initializers" class="md-flex md-footer-nav__link md-footer-nav__link--prev" rel="prev"> |
| <div class="md-flex__cell md-flex__cell--shrink"> |
| <i class="md-icon md-icon--arrow-back md-footer-nav__button"></i> |
| </div> |
| <div class="md-flex__cell md-flex__cell--stretch md-footer-nav__title"> |
| <span class="md-flex__ellipsis"> |
| <span class="md-footer-nav__direction"> |
| Previous |
| </span> |
| Initializers |
| </span> |
| </div> |
| </a> |
| |
| |
| <a href="../callback/" title="Callbacks in training" class="md-flex md-footer-nav__link md-footer-nav__link--next" rel="next"> |
| <div class="md-flex__cell md-flex__cell--stretch md-footer-nav__title"> |
| <span class="md-flex__ellipsis"> |
| <span class="md-footer-nav__direction"> |
| Next |
| </span> |
| Callbacks in training |
| </span> |
| </div> |
| <div class="md-flex__cell md-flex__cell--shrink"> |
| <i class="md-icon md-icon--arrow-forward md-footer-nav__button"></i> |
| </div> |
| </a> |
| |
| </nav> |
| </div> |
| |
| <div class="md-footer-meta md-typeset"> |
| <div class="md-footer-meta__inner md-grid"> |
| <div class="md-footer-copyright"> |
| |
| powered by |
| <a href="https://www.mkdocs.org">MkDocs</a> |
| and |
| <a href="https://squidfunk.github.io/mkdocs-material/"> |
| Material for MkDocs</a> |
| </div> |
| |
| </div> |
| </div> |
| </footer> |
| |
| </div> |
| |
| <script src="../../assets/javascripts/application.808e90bb.js"></script> |
| |
| <script>app.initialize({version:"1.0.4",url:{base:"../.."}})</script> |
| |
| <script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML"></script> |
| |
| <script src="../../assets/mathjaxhelper.js"></script> |
| |
| |
| </body> |
| </html> |