blob: 459e63a8bce7a3302116192507db6be7ee0715d7 [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Gluon Package — mxnet documentation</title>
<link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css" integrity="sha384-1q8mTJOASx8j1Au+a5WDVnPi2lkFfwwEAa8hDDdjZlpLegxhjVME1fgjWPGmkzs7" rel="stylesheet"/>
<link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css" rel="stylesheet"/>
<link href="../../_static/basic.css" rel="stylesheet" type="text/css"/>
<link href="../../_static/pygments.css" rel="stylesheet" type="text/css"/>
<link href="../../_static/mxnet.css" rel="stylesheet" type="text/css">
<script type="text/javascript">
  // Sphinx runtime configuration, consumed by doctools.js and the search scripts.
  var DOCUMENTATION_OPTIONS = {
    URL_ROOT: '../../',        // relative path from this page back to the docs root
    VERSION: '',               // no version string baked into this build
    COLLAPSE_INDEX: false,
    FILE_SUFFIX: '.html',
    HAS_SOURCE: true,          // "show source" links are available
    SOURCELINK_SUFFIX: '.txt'
  };
</script>
<script src="../../_static/jquery-1.11.1.js" type="text/javascript"></script>
<script src="../../_static/underscore.js" type="text/javascript"></script>
<script src="../../_static/searchtools_custom.js" type="text/javascript"></script>
<script src="../../_static/doctools.js" type="text/javascript"></script>
<script src="../../_static/selectlang.js" type="text/javascript"></script>
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript"></script>
<script type="text/javascript"> /* Load the prebuilt search index and initialize in-page search (Search comes from searchtools_custom.js). */ jQuery(function() { Search.loadIndex("/searchindex.js"); Search.init();}); </script>
<script>
// Standard (minified) Google Analytics analytics.js loader snippet; kept
// verbatim because its exact form is the vendor-supplied bootstrap.
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new
Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
// Register tracker for property UA-96378503-1 and record a page view.
ga('create', 'UA-96378503-1', 'auto');
ga('send', 'pageview');
</script>
<!-- -->
<!-- <script type="text/javascript" src="../../_static/jquery.js"></script> -->
<!-- -->
<!-- <script type="text/javascript" src="../../_static/underscore.js"></script> -->
<!-- -->
<!-- <script type="text/javascript" src="../../_static/doctools.js"></script> -->
<!-- -->
<!-- <script type="text/javascript" src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script> -->
<!-- -->
<link href="../../genindex.html" rel="index" title="Index"/>
<link href="../../search.html" rel="search" title="Search">
<link href="index.html" rel="up" title="MXNet - Python API">
<link href="rnn.html" rel="next" title="RNN Cell API"/>
<link href="module.html" rel="prev" title="Module API"/>
<link href="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/mxnet-icon.png" rel="icon" type="image/png"/>
</head>
<body role="document"><!-- Previous Navbar Layout
<div class="navbar navbar-default navbar-fixed-top">
<div class="container">
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false" aria-controls="navbar">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<a href="../../" class="navbar-brand">
<img src="http://data.mxnet.io/theme/mxnet.png">
</a>
</div>
<div id="navbar" class="navbar-collapse collapse">
<ul id="navbar" class="navbar navbar-left">
<li> <a href="../../get_started/index.html">Get Started</a> </li>
<li> <a href="../../tutorials/index.html">Tutorials</a> </li>
<li> <a href="../../how_to/index.html">How To</a> </li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">Packages <span class="caret"></span></a>
<ul class="dropdown-menu">
<li><a href="../../packages/python/index.html">
Python
</a></li>
<li><a href="../../packages/r/index.html">
R
</a></li>
<li><a href="../../packages/julia/index.html">
Julia
</a></li>
<li><a href="../../packages/c++/index.html">
C++
</a></li>
<li><a href="../../packages/scala/index.html">
Scala
</a></li>
<li><a href="../../packages/perl/index.html">
Perl
</a></li>
</ul>
</li>
<li> <a href="../../system/index.html">System</a> </li>
<li>
<form class="" role="search" action="../../search.html" method="get" autocomplete="off">
<div class="form-group inner-addon left-addon">
<i class="glyphicon glyphicon-search"></i>
<input type="text" name="q" class="form-control" placeholder="Search">
</div>
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form> </li>
</ul>
<ul id="navbar" class="navbar navbar-right">
<li> <a href="../../index.html"><span class="flag-icon flag-icon-us"></span></a> </li>
<li> <a href="../..//zh/index.html"><span class="flag-icon flag-icon-cn"></span></a> </li>
</ul>
</div>
</div>
</div>
Previous Navbar Layout End -->
<div class="navbar navbar-fixed-top">
<div class="container" id="navContainer">
<div class="innder" id="header-inner">
<h1 id="logo-wrap">
<a href="../../" id="logo"><img alt="MXNet" src="http://data.mxnet.io/theme/mxnet.png"/></a>
</h1>
<nav class="nav-bar" id="main-nav">
<a class="main-nav-link" href="../../get_started/install.html">Install</a>
<a class="main-nav-link" href="../../tutorials/index.html">Tutorials</a>
<a class="main-nav-link" href="../../how_to/index.html">How To</a>
<span id="dropdown-menu-position-anchor">
<a aria-expanded="true" aria-haspopup="true" class="main-nav-link dropdown-toggle" data-toggle="dropdown" href="#" role="button">API <span class="caret"></span></a>
<ul class="dropdown-menu" id="package-dropdown-menu">
<li><a class="main-nav-link" href="../../api/python/index.html">Python</a></li>
<li><a class="main-nav-link" href="../../api/scala/index.html">Scala</a></li>
<li><a class="main-nav-link" href="../../api/r/index.html">R</a></li>
<li><a class="main-nav-link" href="../../api/julia/index.html">Julia</a></li>
<li><a class="main-nav-link" href="../../api/c++/index.html">C++</a></li>
<li><a class="main-nav-link" href="../../api/perl/index.html">Perl</a></li>
</ul>
</span>
<a class="main-nav-link" href="../../architecture/index.html">Architecture</a>
<!-- <a class="main-nav-link" href="../../community/index.html">Community</a> -->
<a class="main-nav-link" href="https://github.com/dmlc/mxnet">Github</a>
<span id="dropdown-menu-position-anchor-version" style="position: relative"><a href="#" class="main-nav-link dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">Versions(master)<span class="caret"></span></a><ul id="package-dropdown-menu" class="dropdown-menu"><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/">v0.10.14</a></li><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/versions/0.10/index.html">0.10</a></li><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/versions/master/index.html">master</a></li></ul></span></nav>
<script> /* Root-relative path helper used by the theme's navigation/search scripts. */ function getRootPath(){ return "../../" } </script>
<div class="burgerIcon dropdown">
<a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button"></a>
<ul class="dropdown-menu dropdown-menu-right" id="burgerMenu">
<li><a href="../../get_started/install.html">Install</a></li>
<li><a href="../../tutorials/index.html">Tutorials</a></li>
<li><a href="../../how_to/index.html">How To</a></li>
<li class="dropdown-submenu">
<a href="#" tabindex="-1">API</a>
<ul class="dropdown-menu">
<li><a href="../../api/python/index.html" tabindex="-1">Python</a>
</li>
<li><a href="../../api/scala/index.html" tabindex="-1">Scala</a>
</li>
<li><a href="../../api/r/index.html" tabindex="-1">R</a>
</li>
<li><a href="../../api/julia/index.html" tabindex="-1">Julia</a>
</li>
<li><a href="../../api/c++/index.html" tabindex="-1">C++</a>
</li>
<li><a href="../../api/perl/index.html" tabindex="-1">Perl</a>
</li>
</ul>
</li>
<li><a href="../../architecture/index.html">Architecture</a></li>
<li><a class="main-nav-link" href="https://github.com/dmlc/mxnet">Github</a></li>
<li id="dropdown-menu-position-anchor-version-mobile" class="dropdown-submenu" style="position: relative"><a href="#" tabindex="-1">Versions(master)</a><ul class="dropdown-menu"><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/">v0.10.14</a></li><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/versions/0.10/index.html">0.10</a></li><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/versions/master/index.html">master</a></li></ul></li></ul>
</div>
<div class="plusIcon dropdown">
<a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button"><span aria-hidden="true" class="glyphicon glyphicon-plus"></span></a>
<ul class="dropdown-menu dropdown-menu-right" id="plusMenu"></ul>
</div>
<div id="search-input-wrap">
<form action="../../search.html" autocomplete="off" class="" method="get" role="search">
<div class="form-group inner-addon left-addon">
<i class="glyphicon glyphicon-search"></i>
<input class="form-control" name="q" placeholder="Search" type="text"/>
</div>
<input name="check_keywords" type="hidden" value="yes"/>
<input name="area" type="hidden" value="default"/>
</form>
<div id="search-preview"></div>
</div>
<div id="searchIcon">
<span aria-hidden="true" class="glyphicon glyphicon-search"></span>
</div>
<!-- <div id="lang-select-wrap"> -->
<!-- <label id="lang-select-label"> -->
<!-- <\!-- <i class="fa fa-globe"></i> -\-> -->
<!-- <span></span> -->
<!-- </label> -->
<!-- <select id="lang-select"> -->
<!-- <option value="en">Eng</option> -->
<!-- <option value="zh">中文</option> -->
<!-- </select> -->
<!-- </div> -->
<!-- <a id="mobile-nav-toggle">
<span class="mobile-nav-toggle-bar"></span>
<span class="mobile-nav-toggle-bar"></span>
<span class="mobile-nav-toggle-bar"></span>
</a> -->
</div>
</div>
</div>
<div class="container">
<div class="row">
<div aria-label="main navigation" class="sphinxsidebar leftsidebar" role="navigation">
<div class="sphinxsidebarwrapper">
<ul class="current">
<li class="toctree-l1 current"><a class="reference internal" href="index.html">Python Documents</a><ul class="current">
<li class="toctree-l2 current"><a class="reference internal" href="index.html#table-of-contents">Table of contents</a><ul class="current">
<li class="toctree-l3"><a class="reference internal" href="ndarray.html">NDArray API</a></li>
<li class="toctree-l3"><a class="reference internal" href="symbol.html">Symbol API</a></li>
<li class="toctree-l3"><a class="reference internal" href="module.html">Module API</a></li>
<li class="toctree-l3 current"><a class="current reference internal" href="#">Gluon Package</a><ul>
<li class="toctree-l4"><a class="reference internal" href="#overview">Overview</a></li>
<li class="toctree-l4"><a class="reference internal" href="#parameter">Parameter</a></li>
<li class="toctree-l4"><a class="reference internal" href="#containers">Containers</a></li>
<li class="toctree-l4"><a class="reference internal" href="#neural-network-layers">Neural Network Layers</a></li>
<li class="toctree-l4"><a class="reference internal" href="#recurrent-layers">Recurrent Layers</a></li>
<li class="toctree-l4"><a class="reference internal" href="#trainer">Trainer</a></li>
<li class="toctree-l4"><a class="reference internal" href="#loss-functions">Loss functions</a></li>
<li class="toctree-l4"><a class="reference internal" href="#utilities">Utilities</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="rnn.html">RNN Cell API</a></li>
<li class="toctree-l3"><a class="reference internal" href="kvstore.html">KVStore API</a></li>
<li class="toctree-l3"><a class="reference internal" href="io.html">Data Loading API</a></li>
<li class="toctree-l3"><a class="reference internal" href="image.html">Image API</a></li>
<li class="toctree-l3"><a class="reference internal" href="optimization.html">Optimization: initialize and update weights</a></li>
<li class="toctree-l3"><a class="reference internal" href="callback.html">Callback API</a></li>
<li class="toctree-l3"><a class="reference internal" href="metric.html">Evaluation Metric API</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../r/index.html">R Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../julia/index.html">Julia Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../c++/index.html">C++ Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../scala/index.html">Scala Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../perl/index.html">Perl Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../how_to/index.html">HowTo Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../architecture/index.html">System Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../tutorials/index.html">Tutorials</a></li>
</ul>
</div>
</div>
<div class="content">
<div class="section" id="gluon-package">
<span id="gluon-package"></span><h1>Gluon Package<a class="headerlink" href="#gluon-package" title="Permalink to this headline"></a></h1>
<div class="admonition warning">
<p class="first admonition-title">Warning</p>
<p class="last">This package is currently experimental and may change in the near future.</p>
</div>
<script src="../../_static/js/auto_module_index.js" type="text/javascript"></script><div class="section" id="overview">
<span id="overview"></span><h2>Overview<a class="headerlink" href="#overview" title="Permalink to this headline"></a></h2>
<p>The Gluon package is a high-level interface for MXNet designed to be easy to use while
keeping most of the flexibility of the low-level API. Gluon supports both imperative
and symbolic programming, making it easy to train complex models imperatively
in Python and then deploy them with a symbolic graph in C++ and Scala.</p>
</div>
<div class="section" id="parameter">
<span id="parameter"></span><h2>Parameter<a class="headerlink" href="#parameter" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="mxnet.gluon.Parameter">
<em class="property">class </em><code class="descclassname">mxnet.gluon.</code><code class="descname">Parameter</code><span class="sig-paren">(</span><em>name</em>, <em>grad_req='write'</em>, <em>shape=None</em>, <em>dtype=<type 'numpy.float32'=""></type></em>, <em>lr_mult=1.0</em>, <em>wd_mult=1.0</em>, <em>init=None</em>, <em>allow_deferred_init=False</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Parameter" title="Permalink to this definition"></a></dt>
<dd><p>A Container holding parameters (weights) of <a href="#id2"><span class="problematic" id="id3">`</span></a>Block`s.</p>
<p><cite>Parameter</cite> holds a copy of the parameter on each <cite>Context</cite> after
it is initialized with <cite>Parameter.initialize(...)</cite>. If <cite>grad_req</cite> is
not <cite>null</cite>, it will also hold a gradient array on each <cite>Context</cite>:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">ctx</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">gpu</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
<span class="n">x</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="mi">16</span><span class="p">,</span> <span class="mi">100</span><span class="p">),</span> <span class="n">ctx</span><span class="o">=</span><span class="n">ctx</span><span class="p">)</span>
<span class="n">w</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">gluon</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="s1">'fc_weight'</span><span class="p">,</span> <span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">64</span><span class="p">,</span> <span class="mi">100</span><span class="p">),</span> <span class="n">init</span><span class="o">=</span><span class="n">mx</span><span class="o">.</span><span class="n">init</span><span class="o">.</span><span class="n">Xavier</span><span class="p">())</span>
<span class="n">b</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">gluon</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="s1">'fc_bias'</span><span class="p">,</span> <span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">64</span><span class="p">,),</span> <span class="n">init</span><span class="o">=</span><span class="n">mx</span><span class="o">.</span><span class="n">init</span><span class="o">.</span><span class="n">Zero</span><span class="p">())</span>
<span class="n">w</span><span class="o">.</span><span class="n">initialize</span><span class="p">(</span><span class="n">ctx</span><span class="o">=</span><span class="n">ctx</span><span class="p">)</span>
<span class="n">b</span><span class="o">.</span><span class="n">initialize</span><span class="p">(</span><span class="n">ctx</span><span class="o">=</span><span class="n">ctx</span><span class="p">)</span>
<span class="n">out</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">FullyConnected</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">w</span><span class="o">.</span><span class="n">data</span><span class="p">(</span><span class="n">ctx</span><span class="p">),</span> <span class="n">b</span><span class="o">.</span><span class="n">data</span><span class="p">(</span><span class="n">ctx</span><span class="p">),</span> <span class="n">num_hidden</span><span class="o">=</span><span class="mi">64</span><span class="p">)</span>
</pre></div>
</div>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>name</strong> (<em>str</em>) – Name of this parameter.</li>
<li><strong>grad_req</strong> (<em>{'write'</em><em>, </em><em>'add'</em><em>, </em><em>'null'}</em><em>, </em><em>default 'write'</em>) – <p>Specifies how to update gradient to grad arrays.</p>
<ul>
<li>‘write’ means every time the gradient is written to the grad <cite>NDArray</cite>.</li>
<li>‘add’ means every time the gradient is added to the grad <cite>NDArray</cite>. You need
to manually call <cite>zero_grad()</cite> to clear the gradient buffer before each
iteration when using this option.</li>
<li>‘null’ means gradient is not requested for this parameter. Gradient arrays
will not be allocated.</li>
</ul>
</li>
<li><strong>shape</strong> (<em>tuple of int</em><em>, </em><em>default None</em>) – Shape of this parameter. By default shape is not specified. Parameter with
unknown shape can be used for <cite>Symbol</cite> API, but <cite>init</cite> will throw an error
when using <cite>NDArray</cite> API.</li>
<li><strong>dtype</strong> (<em>numpy.dtype</em><em> or </em><em>str</em><em>, </em><em>default 'float32'</em>) – Data type of this parameter. For example, numpy.float32 or ‘float32’.</li>
<li><strong>lr_mult</strong> (<em>float</em><em>, </em><em>default 1.0</em>) – Learning rate multiplier. Learning rate will be multiplied by lr_mult
when updating this parameter with optimizer.</li>
<li><strong>wd_mult</strong> (<em>float</em><em>, </em><em>default 1.0</em>) – Weight decay multiplier (L2 regularizer coefficient). Works similar to lr_mult.</li>
<li><strong>init</strong> (<a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a><em>, </em><em>default None</em>) – Initializer of this parameter. Will use the global initializer by default.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="method">
<dt id="mxnet.gluon.Parameter.initialize">
<code class="descname">initialize</code><span class="sig-paren">(</span><em>init=None</em>, <em>ctx=None</em>, <em>default_init=<mxnet.initializer.uniform object=""></mxnet.initializer.uniform></em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Parameter.initialize" title="Permalink to this definition"></a></dt>
<dd><p>Initializes parameter and gradient arrays. Only used for <cite>NDArray</cite> API.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>init</strong> (<a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – The initializer to use. Overrides <cite>Parameter.init</cite> and default_init.</li>
<li><strong>ctx</strong> (Context or list of Context, defaults to <cite>context.current_context()</cite>.) – <p>Initialize Parameter on given context. If ctx is a list of Context, a
copy will be made for each context.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Copies are independent arrays. The user is responsible for keeping
their values consistent when updating. Normally <cite>gluon.Trainer</cite> does this for you.</p>
</div>
</li>
<li><strong>default_init</strong> (<a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Default initializer is used when both <cite>init</cite> and <cite>Parameter.init</cite> are <cite>None</cite>.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<p class="rubric">Examples</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">weight</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">gluon</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="s1">'weight'</span><span class="p">,</span> <span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">))</span>
<span class="gp">>>> </span><span class="n">weight</span><span class="o">.</span><span class="n">initialize</span><span class="p">(</span><span class="n">ctx</span><span class="o">=</span><span class="n">mx</span><span class="o">.</span><span class="n">cpu</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span>
<span class="gp">>>> </span><span class="n">weight</span><span class="o">.</span><span class="n">data</span><span class="p">()</span>
<span class="go">[[-0.01068833 0.01729892]</span>
<span class="go"> [ 0.02042518 -0.01618656]]</span>
<span class="go"><ndarray 2x2="" @cpu(0)=""></ndarray></span>
<span class="gp">>>> </span><span class="n">weight</span><span class="o">.</span><span class="n">grad</span><span class="p">()</span>
<span class="go">[[ 0. 0.]</span>
<span class="go"> [ 0. 0.]]</span>
<span class="go"><ndarray 2x2="" @cpu(0)=""></ndarray></span>
<span class="gp">>>> </span><span class="n">weight</span><span class="o">.</span><span class="n">initialize</span><span class="p">(</span><span class="n">ctx</span><span class="o">=</span><span class="p">[</span><span class="n">mx</span><span class="o">.</span><span class="n">gpu</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span> <span class="n">mx</span><span class="o">.</span><span class="n">gpu</span><span class="p">(</span><span class="mi">1</span><span class="p">)])</span>
<span class="gp">>>> </span><span class="n">weight</span><span class="o">.</span><span class="n">data</span><span class="p">(</span><span class="n">mx</span><span class="o">.</span><span class="n">gpu</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span>
<span class="go">[[-0.00873779 -0.02834515]</span>
<span class="go"> [ 0.05484822 -0.06206018]]</span>
<span class="go"><ndarray 2x2="" @gpu(0)=""></ndarray></span>
<span class="gp">>>> </span><span class="n">weight</span><span class="o">.</span><span class="n">data</span><span class="p">(</span><span class="n">mx</span><span class="o">.</span><span class="n">gpu</span><span class="p">(</span><span class="mi">1</span><span class="p">))</span>
<span class="go">[[-0.00873779 -0.02834515]</span>
<span class="go"> [ 0.05484822 -0.06206018]]</span>
<span class="go"><ndarray 2x2="" @gpu(1)=""></ndarray></span>
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.Parameter.set_data">
<code class="descname">set_data</code><span class="sig-paren">(</span><em>data</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Parameter.set_data" title="Permalink to this definition"></a></dt>
<dd><p>Sets this parameter’s value on all contexts to data.</p>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.Parameter.data">
<code class="descname">data</code><span class="sig-paren">(</span><em>ctx=None</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Parameter.data" title="Permalink to this definition"></a></dt>
<dd><p>Returns a copy of this parameter on one context. Must have been
initialized on this context before.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>ctx</strong> (<em>Context</em>) – Desired context.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"></td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body">NDArray on ctx</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.Parameter.list_data">
<code class="descname">list_data</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Parameter.list_data" title="Permalink to this definition"></a></dt>
<dd><p>Returns copies of this parameter on all contexts, in the same order
as creation.</p>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.Parameter.grad">
<code class="descname">grad</code><span class="sig-paren">(</span><em>ctx=None</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Parameter.grad" title="Permalink to this definition"></a></dt>
<dd><p>Returns a gradient buffer for this parameter on one context.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>ctx</strong> (<em>Context</em>) – Desired context.</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.Parameter.list_grad">
<code class="descname">list_grad</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Parameter.list_grad" title="Permalink to this definition"></a></dt>
<dd><p>Returns gradient buffers on all contexts, in the same order
as <cite>values</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.Parameter.list_ctx">
<code class="descname">list_ctx</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Parameter.list_ctx" title="Permalink to this definition"></a></dt>
<dd><p>Returns a list of contexts this parameter is initialized on.</p>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.Parameter.zero_grad">
<code class="descname">zero_grad</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Parameter.zero_grad" title="Permalink to this definition"></a></dt>
<dd><p>Sets gradient buffer on all contexts to 0. No action is taken if
parameter is uninitialized or doesn’t require gradient.</p>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.Parameter.var">
<code class="descname">var</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Parameter.var" title="Permalink to this definition"></a></dt>
<dd><p>Returns a symbol representing this parameter.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.ParameterDict">
<em class="property">class </em><code class="descclassname">mxnet.gluon.</code><code class="descname">ParameterDict</code><span class="sig-paren">(</span><em>prefix=''</em>, <em>shared=None</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.ParameterDict" title="Permalink to this definition"></a></dt>
<dd><p>A dictionary managing a set of parameters.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>prefix</strong> (<em>str</em><em>, </em><em>default ''</em>) – The prefix to be prepended to the names of all Parameters created by this dict.</li>
<li><strong>shared</strong> (<a class="reference internal" href="#mxnet.gluon.ParameterDict" title="mxnet.gluon.ParameterDict"><em>ParameterDict</em></a><em> or </em><em>None</em>) – If not <cite>None</cite>, when this dict’s <cite>get</cite> method creates a new parameter, will
first try to retrieve it from <cite>shared</cite> dict. Usually used for sharing
parameters with another <cite>Block</cite>.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="attribute">
<dt id="mxnet.gluon.ParameterDict.prefix">
<code class="descname">prefix</code><a class="headerlink" href="#mxnet.gluon.ParameterDict.prefix" title="Permalink to this definition"></a></dt>
<dd><p>Prefix of this dict. It will be prepended to the names of Parameters created
with <cite>get</cite>.</p>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.ParameterDict.get">
<code class="descname">get</code><span class="sig-paren">(</span><em>name</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.ParameterDict.get" title="Permalink to this definition"></a></dt>
<dd><p>Retrieves a <cite>Parameter</cite> with name <cite>self.prefix+name</cite>. If not found,
<cite>get</cite> will first try to retrieve it from <cite>shared</cite> dict. If still not
found, <cite>get</cite> will create a new <cite>Parameter</cite> with key-word arguments and
insert it into self.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>name</strong> (<em>str</em>) – Name of the desired Parameter. It will be prepended with this dictionary’s
prefix.</li>
<li><strong>**kwargs</strong> (<em>dict</em>) – The rest of key-word arguments for the created <cite>Parameter</cite>.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">The created or retrieved <cite>Parameter</cite>.</p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference internal" href="#mxnet.gluon.Parameter" title="mxnet.gluon.Parameter">Parameter</a></p>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.ParameterDict.update">
<code class="descname">update</code><span class="sig-paren">(</span><em>other</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.ParameterDict.update" title="Permalink to this definition"></a></dt>
<dd><p>Copies all Parameters in <cite>other</cite> to self.</p>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.ParameterDict.initialize">
<code class="descname">initialize</code><span class="sig-paren">(</span><em>init=&lt;mxnet.initializer.Uniform object&gt;</em>, <em>ctx=None</em>, <em>verbose=False</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.ParameterDict.initialize" title="Permalink to this definition"></a></dt>
<dd><p>Initializes all Parameters managed by this dictionary to be used for <cite>NDArray</cite>
API. It has no effect when using <cite>Symbol</cite> API.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>init</strong> (<a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Global default Initializer to be used when <cite>Parameter.init</cite> is <cite>None</cite>.
Otherwise, <cite>Parameter.init</cite> takes precedence.</li>
<li><strong>ctx</strong> (<em>Context</em><em> or </em><em>list of Context</em>) – Keeps a copy of Parameters on one or many context(s).</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.ParameterDict.zero_grad">
<code class="descname">zero_grad</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.ParameterDict.zero_grad" title="Permalink to this definition"></a></dt>
<dd><p>Sets all Parameters’ gradient buffer to 0.</p>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.ParameterDict.save">
<code class="descname">save</code><span class="sig-paren">(</span><em>filename</em>, <em>strip_prefix=''</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.ParameterDict.save" title="Permalink to this definition"></a></dt>
<dd><p>Save parameters to file.</p>
<dl class="docutils">
<dt>filename</dt>
<span class="classifier-delimiter">:</span> <span class="classifier">str</span><dd>Path to parameter file.</dd>
<dt>strip_prefix</dt>
<span class="classifier-delimiter">:</span> <span class="classifier">str, default ‘’</span><dd>Strip prefix from parameter names before saving.</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.ParameterDict.load">
<code class="descname">load</code><span class="sig-paren">(</span><em>filename</em>, <em>ctx</em>, <em>allow_missing=False</em>, <em>ignore_extra=False</em>, <em>restore_prefix=''</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.ParameterDict.load" title="Permalink to this definition"></a></dt>
<dd><p>Load parameters from file.</p>
<dl class="docutils">
<dt>filename</dt>
<span class="classifier-delimiter">:</span> <span class="classifier">str</span><dd>Path to parameter file.</dd>
<dt>ctx</dt>
<span class="classifier-delimiter">:</span> <span class="classifier">Context or list of Context</span><dd>Context(s) to initialize loaded parameters on.</dd>
<dt>allow_missing</dt>
<span class="classifier-delimiter">:</span> <span class="classifier">bool, default False</span><dd>Whether to silently skip loading parameters not represented in the file.</dd>
<dt>ignore_extra</dt>
<span class="classifier-delimiter">:</span> <span class="classifier">bool, default False</span><dd>Whether to silently ignore parameters from the file that are not
present in this ParameterDict.</dd>
<dt>restore_prefix</dt>
<span class="classifier-delimiter">:</span> <span class="classifier">str, default ‘’</span><dd>Prepend this prefix to names of stored parameters before loading.</dd>
</dl>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="containers">
<h2>Containers<a class="headerlink" href="#containers" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="mxnet.gluon.Block">
<em class="property">class </em><code class="descclassname">mxnet.gluon.</code><code class="descname">Block</code><span class="sig-paren">(</span><em>prefix=None</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Block" title="Permalink to this definition"></a></dt>
<dd><p>Base class for all neural network layers and models. Your models should
subclass this class.</p>
<p><cite>Block</cite> can be nested recursively in a tree structure. You can create and
assign child <cite>Block</cite> as regular attributes:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">mxnet.gluon</span> <span class="k">import</span> <span class="n">Block</span><span class="p">,</span> <span class="n">nn</span>
<span class="kn">from</span> <span class="nn">mxnet</span> <span class="k">import</span> <span class="n">ndarray</span> <span class="k">as</span> <span class="n">F</span>
<span class="k">class</span> <span class="nc">Model</span><span class="p">(</span><span class="n">Block</span><span class="p">):</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Model</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="c1"># use name_scope to give child Blocks appropriate names.</span>
<span class="c1"># It also allows sharing Parameters between Blocks recursively.</span>
<span class="k">with</span> <span class="bp">self</span><span class="o">.</span><span class="n">name_scope</span><span class="p">():</span>
<span class="bp">self</span><span class="o">.</span><span class="n">dense0</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Dense</span><span class="p">(</span><span class="mi">20</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">dense1</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Dense</span><span class="p">(</span><span class="mi">20</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">):</span>
<span class="n">x</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">relu</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dense0</span><span class="p">(</span><span class="n">x</span><span class="p">))</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">relu</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dense1</span><span class="p">(</span><span class="n">x</span><span class="p">))</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">Model</span><span class="p">()</span>
<span class="n">model</span><span class="o">.</span><span class="n">initialize</span><span class="p">(</span><span class="n">ctx</span><span class="o">=</span><span class="n">mx</span><span class="o">.</span><span class="n">cpu</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span>
<span class="n">model</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="mi">10</span><span class="p">,</span> <span class="mi">10</span><span class="p">),</span> <span class="n">ctx</span><span class="o">=</span><span class="n">mx</span><span class="o">.</span><span class="n">cpu</span><span class="p">(</span><span class="mi">0</span><span class="p">)))</span>
</pre></div>
</div>
<p>Child <cite>Block</cite>s assigned this way will be registered and <cite>collect_params</cite>
will collect their Parameters recursively.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>prefix</strong> (<em>str</em>) – Prefix acts like a name space. It will be prepended to the names of all
Parameters and child <cite>Block</cite>s in this <cite>Block</cite>’s <cite>name_scope</cite>. Prefix
should be unique within one model to prevent name collisions.</li>
<li><strong>params</strong> (<a class="reference internal" href="#mxnet.gluon.ParameterDict" title="mxnet.gluon.ParameterDict"><em>ParameterDict</em></a><em> or </em><em>None</em>) – <p><cite>ParameterDict</cite> for sharing weights with the new <cite>Block</cite>. For example,
if you want <cite>dense1</cite> to share <cite>dense0</cite>‘s weights, you can do:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">dense0</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Dense</span><span class="p">(</span><span class="mi">20</span><span class="p">)</span>
<span class="n">dense1</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Dense</span><span class="p">(</span><span class="mi">20</span><span class="p">,</span> <span class="n">params</span><span class="o">=</span><span class="n">dense0</span><span class="o">.</span><span class="n">collect_params</span><span class="p">())</span>
</pre></div>
</div>
</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="method">
<dt id="mxnet.gluon.Block.forward">
<code class="descname">forward</code><span class="sig-paren">(</span><em>*args</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Block.forward" title="Permalink to this definition"></a></dt>
<dd><p>Overrides to implement forward computation using <cite>NDArray</cite>. Only
accepts positional arguments.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>*args</strong> (<em>list of NDArray</em>) – Input tensors.</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="attribute">
<dt id="mxnet.gluon.Block.prefix">
<code class="descname">prefix</code><a class="headerlink" href="#mxnet.gluon.Block.prefix" title="Permalink to this definition"></a></dt>
<dd><p>Prefix of this <cite>Block</cite>.</p>
</dd></dl>
<dl class="attribute">
<dt id="mxnet.gluon.Block.name">
<code class="descname">name</code><a class="headerlink" href="#mxnet.gluon.Block.name" title="Permalink to this definition"></a></dt>
<dd><p>Name of this <cite>Block</cite>, without ‘_’ at the end.</p>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.Block.name_scope">
<code class="descname">name_scope</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Block.name_scope" title="Permalink to this definition"></a></dt>
<dd><p>Returns a name space object managing a child <cite>Block</cite> and parameter
names. Should be used within a <cite>with</cite> statement:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="k">with</span> <span class="bp">self</span><span class="o">.</span><span class="n">name_scope</span><span class="p">():</span>
<span class="bp">self</span><span class="o">.</span><span class="n">dense</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Dense</span><span class="p">(</span><span class="mi">20</span><span class="p">)</span>
</pre></div>
</div>
</dd></dl>
<dl class="attribute">
<dt id="mxnet.gluon.Block.params">
<code class="descname">params</code><a class="headerlink" href="#mxnet.gluon.Block.params" title="Permalink to this definition"></a></dt>
<dd><p>Returns this <cite>Block</cite>‘s parameter dictionary (does not include its
children’s parameters).</p>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.Block.collect_params">
<code class="descname">collect_params</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Block.collect_params" title="Permalink to this definition"></a></dt>
<dd><p>Returns a <cite>ParameterDict</cite> containing this <cite>Block</cite> and all of its
children’s Parameters.</p>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.Block.save_params">
<code class="descname">save_params</code><span class="sig-paren">(</span><em>filename</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Block.save_params" title="Permalink to this definition"></a></dt>
<dd><p>Save parameters to file.</p>
<dl class="docutils">
<dt>filename</dt>
<span class="classifier-delimiter">:</span> <span class="classifier">str</span><dd>Path to file.</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.Block.load_params">
<code class="descname">load_params</code><span class="sig-paren">(</span><em>filename</em>, <em>ctx</em>, <em>allow_missing=False</em>, <em>ignore_extra=False</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Block.load_params" title="Permalink to this definition"></a></dt>
<dd><p>Load parameters from file.</p>
<dl class="docutils">
<dt>filename</dt>
<span class="classifier-delimiter">:</span> <span class="classifier">str</span><dd>Path to parameter file.</dd>
<dt>ctx</dt>
<span class="classifier-delimiter">:</span> <span class="classifier">Context or list of Context</span><dd>Context(s) to initialize loaded parameters on.</dd>
<dt>allow_missing</dt>
<span class="classifier-delimiter">:</span> <span class="classifier">bool, default False</span><dd>Whether to silently skip loading parameters not represented in the file.</dd>
<dt>ignore_extra</dt>
<span class="classifier-delimiter">:</span> <span class="classifier">bool, default False</span><dd>Whether to silently ignore parameters from the file that are not
present in this Block.</dd>
</dl>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.Block.register_child">
<code class="descname">register_child</code><span class="sig-paren">(</span><em>block</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Block.register_child" title="Permalink to this definition"></a></dt>
<dd><p>Registers block as a child of self. <cite>Block</cite>s assigned to self as
attributes will be registered automatically.</p>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.Block.initialize">
<code class="descname">initialize</code><span class="sig-paren">(</span><em>init=&lt;mxnet.initializer.Uniform object&gt;</em>, <em>ctx=None</em>, <em>verbose=False</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Block.initialize" title="Permalink to this definition"></a></dt>
<dd><p>Initializes <cite>Parameter</cite>s of this <cite>Block</cite> and its children.</p>
<p>Equivalent to <cite>block.collect_params().initialize(...)</cite></p>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.Block.hybridize">
<code class="descname">hybridize</code><span class="sig-paren">(</span><em>active=True</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Block.hybridize" title="Permalink to this definition"></a></dt>
<dd><p>Activates or deactivates <cite>HybridBlock</cite>s recursively. Has no effect on
non-hybrid children.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>active</strong> (<em>bool</em><em>, </em><em>default True</em>) – Whether to turn hybrid on or off.</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt>
<code class="descname">forward</code><span class="sig-paren">(</span><em>*args</em><span class="sig-paren">)</span></dt>
<dd><p>Overrides to implement forward computation using <cite>NDArray</cite>. Only
accepts positional arguments.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>*args</strong> (<em>list of NDArray</em>) – Input tensors.</td>
</tr>
</tbody>
</table>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.HybridBlock">
<em class="property">class </em><code class="descclassname">mxnet.gluon.</code><code class="descname">HybridBlock</code><span class="sig-paren">(</span><em>prefix=None</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.HybridBlock" title="Permalink to this definition"></a></dt>
<dd><p><cite>HybridBlock</cite> supports forwarding with both Symbol and NDArray.</p>
<p>Forward computation in <cite>HybridBlock</cite> must be static to work with <cite>Symbol</cite>s,
i.e. you cannot call <cite>.asnumpy()</cite>, <cite>.shape</cite>, <cite>.dtype</cite>, etc. on tensors.
Also, you cannot use branching or loop logic that bases on non-constant
expressions like random numbers or intermediate results, since they change
the graph structure for each iteration.</p>
<p>Before activating with <cite>hybridize()</cite>, <cite>HybridBlock</cite> works just like normal
<cite>Block</cite>. After activation, <cite>HybridBlock</cite> will create a symbolic graph
representing the forward computation and cache it. On subsequent forwards,
the cached graph will be used instead of <cite>hybrid_forward</cite>.</p>
<p>Refer to the <a class="reference external" href="http://mxnet.io/tutorials/gluon/hybrid.html">Hybrid tutorial</a> to see
the end-to-end usage.</p>
<dl class="method">
<dt id="mxnet.gluon.HybridBlock.hybrid_forward">
<code class="descname">hybrid_forward</code><span class="sig-paren">(</span><em>F</em>, <em>x</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.HybridBlock.hybrid_forward" title="Permalink to this definition"></a></dt>
<dd><p>Overrides to construct symbolic graph for this <cite>Block</cite>.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>x</strong> (<a class="reference internal" href="symbol.html#mxnet.symbol.Symbol" title="mxnet.symbol.Symbol"><em>Symbol</em></a><em> or </em><a class="reference internal" href="ndarray.html#mxnet.ndarray.NDArray" title="mxnet.ndarray.NDArray"><em>NDArray</em></a>) – The first input tensor.</li>
<li><strong>*args</strong> (<em>list of Symbol</em><em> or </em><em>list of NDArray</em>) – Additional input tensors.</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.HybridBlock.infer_shape">
<code class="descname">infer_shape</code><span class="sig-paren">(</span><em>*args</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.HybridBlock.infer_shape" title="Permalink to this definition"></a></dt>
<dd><p>Infers shape of Parameters from inputs.</p>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.HybridBlock.forward">
<code class="descname">forward</code><span class="sig-paren">(</span><em>x</em>, <em>*args</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.HybridBlock.forward" title="Permalink to this definition"></a></dt>
<dd><p>Defines the forward computation. Arguments can be either
<cite>NDArray</cite> or <cite>Symbol</cite>.</p>
</dd></dl>
<dl class="method">
<dt>
<code class="descname">hybrid_forward</code><span class="sig-paren">(</span><em>F</em>, <em>x</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span></dt>
<dd><p>Overrides to construct symbolic graph for this <cite>Block</cite>.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>x</strong> (<a class="reference internal" href="symbol.html#mxnet.symbol.Symbol" title="mxnet.symbol.Symbol"><em>Symbol</em></a><em> or </em><a class="reference internal" href="ndarray.html#mxnet.ndarray.NDArray" title="mxnet.ndarray.NDArray"><em>NDArray</em></a>) – The first input tensor.</li>
<li><strong>*args</strong> (<em>list of Symbol</em><em> or </em><em>list of NDArray</em>) – Additional input tensors.</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="neural-network-layers">
<h2>Neural Network Layers<a class="headerlink" href="#neural-network-layers" title="Permalink to this headline"></a></h2>
<div class="section" id="id1">
<h3>Containers<a class="headerlink" href="#id1" title="Permalink to this headline"></a></h3>
<dl class="class">
<dt id="mxnet.gluon.nn.Sequential">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">Sequential</code><span class="sig-paren">(</span><em>prefix=None</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.Sequential" title="Permalink to this definition"></a></dt>
<dd><p>Stacks <cite>Block</cite>s sequentially.</p>
<p>Example:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">net</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Sequential</span><span class="p">()</span>
<span class="c1"># use net's name_scope to give child Blocks appropriate names.</span>
<span class="k">with</span> <span class="n">net</span><span class="o">.</span><span class="n">name_scope</span><span class="p">():</span>
<span class="n">net</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">nn</span><span class="o">.</span><span class="n">Dense</span><span class="p">(</span><span class="mi">10</span><span class="p">,</span> <span class="n">activation</span><span class="o">=</span><span class="s1">'relu'</span><span class="p">))</span>
<span class="n">net</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">nn</span><span class="o">.</span><span class="n">Dense</span><span class="p">(</span><span class="mi">20</span><span class="p">))</span>
</pre></div>
</div>
<dl class="method">
<dt id="mxnet.gluon.nn.Sequential.add">
<code class="descname">add</code><span class="sig-paren">(</span><em>block</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.Sequential.add" title="Permalink to this definition"></a></dt>
<dd><p>Adds block on top of the stack.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.HybridSequential">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">HybridSequential</code><span class="sig-paren">(</span><em>prefix=None</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.HybridSequential" title="Permalink to this definition"></a></dt>
<dd><p>Stacks <cite>HybridBlock</cite>s sequentially.</p>
<p>Example:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">net</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">HybridSequential</span><span class="p">()</span>
<span class="c1"># use net's name_scope to give child Blocks appropriate names.</span>
<span class="k">with</span> <span class="n">net</span><span class="o">.</span><span class="n">name_scope</span><span class="p">():</span>
<span class="n">net</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">nn</span><span class="o">.</span><span class="n">Dense</span><span class="p">(</span><span class="mi">10</span><span class="p">,</span> <span class="n">activation</span><span class="o">=</span><span class="s1">'relu'</span><span class="p">))</span>
<span class="n">net</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">nn</span><span class="o">.</span><span class="n">Dense</span><span class="p">(</span><span class="mi">20</span><span class="p">))</span>
</pre></div>
</div>
<dl class="method">
<dt id="mxnet.gluon.nn.HybridSequential.add">
<code class="descname">add</code><span class="sig-paren">(</span><em>block</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.HybridSequential.add" title="Permalink to this definition"></a></dt>
<dd><p>Adds block on top of the stack.</p>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="basic-layers">
<h3>Basic Layers<a class="headerlink" href="#basic-layers" title="Permalink to this headline"></a></h3>
<dl class="class">
<dt id="mxnet.gluon.nn.Dense">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">Dense</code><span class="sig-paren">(</span><em>units</em>, <em>activation=None</em>, <em>use_bias=True</em>, <em>weight_initializer=None</em>, <em>bias_initializer='zeros'</em>, <em>in_units=0</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.Dense" title="Permalink to this definition"></a></dt>
<dd><p>Just your regular densely-connected NN layer.</p>
<p><cite>Dense</cite> implements the operation:
<cite>output = activation(dot(input, weight) + bias)</cite>
where <cite>activation</cite> is the element-wise activation function
passed as the <cite>activation</cite> argument, <cite>weight</cite> is a weights matrix
created by the layer, and <cite>bias</cite> is a bias vector created by the layer
(only applicable if <cite>use_bias</cite> is <cite>True</cite>).</p>
<p>Note: the input must be a tensor with rank 2. Use <cite>flatten</cite> to convert it
to rank 2 manually if necessary.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>units</strong> (<em>int</em>) – Dimensionality of the output space.</li>
<li><strong>activation</strong> (<em>str</em>) – Activation function to use. See help on <cite>Activation</cite> layer.
If you don’t specify anything, no activation is applied
(ie. “linear” activation: <cite>a(x) = x</cite>).</li>
<li><strong>use_bias</strong> (<em>bool</em>) – Whether the layer uses a bias vector.</li>
<li><strong>weight_initializer</strong> (str or <cite>Initializer</cite>) – Initializer for the <cite>kernel</cite> weights matrix.</li>
<li><strong>bias_initializer</strong> (str or <cite>Initializer</cite>) – Initializer for the bias vector.</li>
<li><strong>in_units</strong> (<em>int</em><em>, </em><em>optional</em>) – Size of the input data. If not specified, initialization will be
deferred to the first time <cite>forward</cite> is called and <cite>in_units</cite>
will be inferred from the shape of input data.</li>
<li><strong>prefix</strong> (<em>str</em><em> or </em><em>None</em>) – See document of <cite>Block</cite>.</li>
<li><strong>params</strong> (<a class="reference internal" href="#mxnet.gluon.ParameterDict" title="mxnet.gluon.ParameterDict"><em>ParameterDict</em></a><em> or </em><em>None</em>) – See document of <cite>Block</cite>.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shape:</dt>
<dd>A 2D input with shape <cite>(batch_size, in_units)</cite>.</dd>
<dt>Output shape:</dt>
<dd>The output would have shape <cite>(batch_size, units)</cite>.</dd>
</dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.Activation">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">Activation</code><span class="sig-paren">(</span><em>activation</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.Activation" title="Permalink to this definition"></a></dt>
<dd><p>Applies an activation function to input.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>activation</strong> (<em>str</em>) – Name of activation function to use.
See <a class="reference internal" href="ndarray.html#mxnet.ndarray.Activation" title="mxnet.ndarray.Activation"><code class="xref py py-func docutils literal"><span class="pre">Activation()</span></code></a> for available choices.</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shape:</dt>
<dd>Arbitrary.</dd>
<dt>Output shape:</dt>
<dd>Same shape as input.</dd>
</dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.Dropout">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">Dropout</code><span class="sig-paren">(</span><em>rate</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.Dropout" title="Permalink to this definition"></a></dt>
<dd><p>Applies Dropout to the input.</p>
<p>Dropout consists of randomly setting a fraction <cite>rate</cite> of input units
to 0 at each update during training time, which helps prevent overfitting.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>rate</strong> (<em>float</em>) – Fraction of the input units to drop. Must be a number between 0 and 1.</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shape:</dt>
<dd>Arbitrary.</dd>
<dt>Output shape:</dt>
<dd>Same shape as input.</dd>
</dl>
<p class="rubric">References</p>
<p><a class="reference external" href="http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf">Dropout: A Simple Way to Prevent Neural Networks from Overfitting</a></p>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.BatchNorm">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">BatchNorm</code><span class="sig-paren">(</span><em>axis=1</em>, <em>momentum=0.9</em>, <em>epsilon=0.001</em>, <em>center=True</em>, <em>scale=True</em>, <em>beta_initializer='zeros'</em>, <em>gamma_initializer='ones'</em>, <em>running_mean_initializer='zeros'</em>, <em>running_variance_initializer='ones'</em>, <em>in_channels=0</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.BatchNorm" title="Permalink to this definition"></a></dt>
<dd><p>Batch normalization layer (Ioffe and Szegedy, 2014).
Normalizes the input at each batch, i.e. applies a transformation
that maintains the mean activation close to 0 and the activation
standard deviation close to 1.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>axis</strong> (<em>int</em><em>, </em><em>default 1</em>) – The axis that should be normalized. This is typically the channels
(C) axis. For instance, after a <cite>Conv2D</cite> layer with <cite>layout=’NCHW’</cite>,
set <cite>axis=1</cite> in <cite>BatchNorm</cite>. If <cite>layout=’NHWC’</cite>, then set <cite>axis=3</cite>.</li>
<li><strong>momentum</strong> (<em>float</em><em>, </em><em>default 0.9</em>) – Momentum for the moving average.</li>
<li><strong>epsilon</strong> (<em>float</em><em>, </em><em>default 1e-3</em>) – Small float added to variance to avoid dividing by zero.</li>
<li><strong>center</strong> (<em>bool</em><em>, </em><em>default True</em>) – If True, add offset of <cite>beta</cite> to normalized tensor.
If False, <cite>beta</cite> is ignored.</li>
<li><strong>scale</strong> (<em>bool</em><em>, </em><em>default True</em>) – If True, multiply by <cite>gamma</cite>. If False, <cite>gamma</cite> is not used.
When the next layer is linear (also e.g. <cite>nn.relu</cite>),
this can be disabled since the scaling
will be done by the next layer.</li>
<li><strong>beta_initializer</strong> (str or <cite>Initializer</cite>, default ‘zeros’) – Initializer for the beta weight.</li>
<li><strong>gamma_initializer</strong> (str or <cite>Initializer</cite>, default ‘ones’) – Initializer for the gamma weight.</li>
<li><strong>running_mean_initializer</strong> (str or <cite>Initializer</cite>, default ‘zeros’) – Initializer for the running mean.</li>
<li><strong>running_variance_initializer</strong> (str or <cite>Initializer</cite>, default ‘ones’) – Initializer for the running variance.</li>
<li><strong>in_channels</strong> (<em>int</em><em>, </em><em>default 0</em>) – Number of channels (feature maps) in input data. If not specified,
initialization will be deferred to the first time <cite>forward</cite> is called
and <cite>in_channels</cite> will be inferred from the shape of input data.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shape:</dt>
<dd>Arbitrary.</dd>
<dt>Output shape:</dt>
<dd>Same shape as input.</dd>
</dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.LeakyReLU">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">LeakyReLU</code><span class="sig-paren">(</span><em>alpha</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.LeakyReLU" title="Permalink to this definition"></a></dt>
<dd><p>Leaky version of a Rectified Linear Unit.</p>
<p>It allows a small gradient when the unit is not active:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span>`f(x) = alpha * x for x &lt; 0`,
`f(x) = x for x >= 0`.
</pre></div>
</div>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>alpha</strong> (<em>float</em>) – slope coefficient for the negative half axis. Must be >= 0.</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shape:</dt>
<dd>Arbitrary.</dd>
<dt>Output shape:</dt>
<dd>Same shape as input.</dd>
</dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.Embedding">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">Embedding</code><span class="sig-paren">(</span><em>input_dim</em>, <em>output_dim</em>, <em>dtype='float32'</em>, <em>weight_initializer=None</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.Embedding" title="Permalink to this definition"></a></dt>
<dd><p>Turns non-negative integers (indexes/tokens) into dense vectors
of fixed size, e.g. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>input_dim</strong> (<em>int</em>) – Size of the vocabulary, i.e. maximum integer index + 1.</li>
<li><strong>output_dim</strong> (<em>int</em>) – Dimension of the dense embedding.</li>
<li><strong>dtype</strong> (<em>str</em><em> or </em><em>np.dtype</em><em>, </em><em>default 'float32'</em>) – Data type of output embeddings.</li>
<li><strong>weight_initializer</strong> (<a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the <cite>embeddings</cite> matrix.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shape:</dt>
<dd>2D tensor with shape: <cite>(N, M)</cite>.</dd>
<dt>Output shape:</dt>
<dd>3D tensor with shape: <cite>(N, M, output_dim)</cite>.</dd>
</dl>
</dd></dl>
</div>
<div class="section" id="convolutional-layers">
<span id="convolutional-layers"></span><h3>Convolutional Layers<a class="headerlink" href="#convolutional-layers" title="Permalink to this headline"></a></h3>
<dl class="class">
<dt id="mxnet.gluon.nn.Conv1D">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">Conv1D</code><span class="sig-paren">(</span><em>channels</em>, <em>kernel_size</em>, <em>strides=1</em>, <em>padding=0</em>, <em>dilation=1</em>, <em>groups=1</em>, <em>layout='NCW'</em>, <em>activation=None</em>, <em>use_bias=True</em>, <em>weight_initializer=None</em>, <em>bias_initializer='zeros'</em>, <em>in_channels=0</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.Conv1D" title="Permalink to this definition"></a></dt>
<dd><p>1D convolution layer (e.g. temporal convolution).</p>
<p>This layer creates a convolution kernel that is convolved
with the layer input over a single spatial (or temporal) dimension
to produce a tensor of outputs.
If <cite>use_bias</cite> is True, a bias vector is created and added to the outputs.
Finally, if <cite>activation</cite> is not <cite>None</cite>,
it is applied to the outputs as well.</p>
<p>If <cite>in_channels</cite> is not specified, <cite>Parameter</cite> initialization will be
deferred to the first time <cite>forward</cite> is called and <cite>in_channels</cite> will be
inferred from the shape of input data.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>channels</strong> (<em>int</em>) – The dimensionality of the output space, i.e. the number of output
channels (filters) in the convolution.</li>
<li><strong>kernel_size</strong> (<em>int</em><em> or </em><em>tuple/list of 1 int</em>) – Specifies the dimensions of the convolution window.</li>
<li><strong>strides</strong> (<em>int</em><em> or </em><em>tuple/list of 1 int</em><em>,</em><em></em>) – Specify the strides of the convolution.</li>
<li><strong>padding</strong> (<em>int</em><em> or </em><em>a tuple/list of 1 int</em><em>,</em><em></em>) – If padding is non-zero, then the input is implicitly zero-padded
on both sides for padding number of points</li>
<li><strong>dilation</strong> (<em>int</em><em> or </em><em>tuple/list of 1 int</em>) – Specifies the dilation rate to use for dilated convolution.</li>
<li><strong>groups</strong> (<em>int</em>) – Controls the connections between inputs and outputs.
At groups=1, all inputs are convolved to all outputs.
At groups=2, the operation becomes equivalent to having two conv
layers side by side, each seeing half the input channels, and producing
half the output channels, and both subsequently concatenated.</li>
<li><strong>layout</strong> (<em>str</em><em>, </em><em>default 'NCW'</em>) – Dimension ordering of data and weight. Can be ‘NCW’, ‘NWC’, etc.
‘N’, ‘C’, ‘W’ stands for batch, channel, and width (time) dimensions
respectively. Convolution is applied on the ‘W’ dimension.</li>
<li><strong>in_channels</strong> (<em>int</em><em>, </em><em>default 0</em>) – The number of input channels to this layer. If not specified,
initialization will be deferred to the first time <cite>forward</cite> is called
and <cite>in_channels</cite> will be inferred from the shape of input data.</li>
<li><strong>activation</strong> (<em>str</em>) – Activation function to use. See <a class="reference internal" href="ndarray.html#mxnet.ndarray.Activation" title="mxnet.ndarray.Activation"><code class="xref py py-func docutils literal"><span class="pre">Activation()</span></code></a>.
If you don’t specify anything, no activation is applied
(ie. “linear” activation: <cite>a(x) = x</cite>).</li>
<li><strong>use_bias</strong> (<em>bool</em>) – Whether the layer uses a bias vector.</li>
<li><strong>weight_initializer</strong> (str or <cite>Initializer</cite>) – Initializer for the <cite>weight</cite> weights matrix.</li>
<li><strong>bias_initializer</strong> (str or <cite>Initializer</cite>) – Initializer for the bias vector.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shape:</dt>
<dd>This depends on the <cite>layout</cite> parameter. Input is 3D array of shape
(batch_size, in_channels, width) if <cite>layout</cite> is <cite>NCW</cite>.</dd>
<dt>Output shape:</dt>
<dd><p class="first">This depends on the <cite>layout</cite> parameter. Output is 3D array of shape
(batch_size, channels, out_width) if <cite>layout</cite> is <cite>NCW</cite>.
out_width is calculated as:</p>
<div class="last highlight-default"><div class="highlight"><pre><span></span><span class="n">out_width</span> <span class="o">=</span> <span class="n">floor</span><span class="p">((</span><span class="n">width</span><span class="o">+</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="o">-</span><span class="n">dilation</span><span class="o">*</span><span class="p">(</span><span class="n">kernel_size</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">/</span><span class="n">stride</span><span class="p">)</span><span class="o">+</span><span class="mi">1</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.Conv2D">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">Conv2D</code><span class="sig-paren">(</span><em>channels</em>, <em>kernel_size</em>, <em>strides=(1</em>, <em>1)</em>, <em>padding=(0</em>, <em>0)</em>, <em>dilation=(1</em>, <em>1)</em>, <em>groups=1</em>, <em>layout='NCHW'</em>, <em>activation=None</em>, <em>use_bias=True</em>, <em>weight_initializer=None</em>, <em>bias_initializer='zeros'</em>, <em>in_channels=0</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.Conv2D" title="Permalink to this definition"></a></dt>
<dd><p>2D convolution layer (e.g. spatial convolution over images).</p>
<p>This layer creates a convolution kernel that is convolved
with the layer input to produce a tensor of
outputs. If <cite>use_bias</cite> is True,
a bias vector is created and added to the outputs. Finally, if
<cite>activation</cite> is not <cite>None</cite>, it is applied to the outputs as well.</p>
<p>If <cite>in_channels</cite> is not specified, <cite>Parameter</cite> initialization will be
deferred to the first time <cite>forward</cite> is called and <cite>in_channels</cite> will be
inferred from the shape of input data.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>channels</strong> (<em>int</em>) – The dimensionality of the output space, i.e. the number of output
channels (filters) in the convolution.</li>
<li><strong>kernel_size</strong> (<em>int</em><em> or </em><em>tuple/list of 2 int</em>) – Specifies the dimensions of the convolution window.</li>
<li><strong>strides</strong> (<em>int</em><em> or </em><em>tuple/list of 2 int</em><em>,</em><em></em>) – Specify the strides of the convolution.</li>
<li><strong>padding</strong> (<em>int</em><em> or </em><em>a tuple/list of 2 int</em><em>,</em><em></em>) – If padding is non-zero, then the input is implicitly zero-padded
on both sides for padding number of points</li>
<li><strong>dilation</strong> (<em>int</em><em> or </em><em>tuple/list of 2 int</em>) – Specifies the dilation rate to use for dilated convolution.</li>
<li><strong>groups</strong> (<em>int</em>) – Controls the connections between inputs and outputs.
At groups=1, all inputs are convolved to all outputs.
At groups=2, the operation becomes equivalent to having two conv
layers side by side, each seeing half the input channels, and producing
half the output channels, and both subsequently concatenated.</li>
<li><strong>layout</strong> (<em>str</em><em>, </em><em>default 'NCHW'</em>) – Dimension ordering of data and weight. Can be ‘NCHW’, ‘NHWC’, etc.
‘N’, ‘C’, ‘H’, ‘W’ stands for batch, channel, height, and width
dimensions respectively. Convolution is applied on the ‘H’ and
‘W’ dimensions.</li>
<li><strong>in_channels</strong> (<em>int</em><em>, </em><em>default 0</em>) – The number of input channels to this layer. If not specified,
initialization will be deferred to the first time <cite>forward</cite> is called
and <cite>in_channels</cite> will be inferred from the shape of input data.</li>
<li><strong>activation</strong> (<em>str</em>) – Activation function to use. See <a class="reference internal" href="ndarray.html#mxnet.ndarray.Activation" title="mxnet.ndarray.Activation"><code class="xref py py-func docutils literal"><span class="pre">Activation()</span></code></a>.
If you don’t specify anything, no activation is applied
(ie. “linear” activation: <cite>a(x) = x</cite>).</li>
<li><strong>use_bias</strong> (<em>bool</em>) – Whether the layer uses a bias vector.</li>
<li><strong>weight_initializer</strong> (str or <cite>Initializer</cite>) – Initializer for the <cite>weight</cite> weights matrix.</li>
<li><strong>bias_initializer</strong> (str or <cite>Initializer</cite>) – Initializer for the bias vector.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shape:</dt>
<dd>This depends on the <cite>layout</cite> parameter. Input is 4D array of shape
(batch_size, in_channels, height, width) if <cite>layout</cite> is <cite>NCHW</cite>.</dd>
<dt>Output shape:</dt>
<dd><p class="first">This depends on the <cite>layout</cite> parameter. Output is 4D array of shape
(batch_size, channels, out_height, out_width) if <cite>layout</cite> is <cite>NCHW</cite>.</p>
<p>out_height and out_width are calculated as:</p>
<div class="last highlight-default"><div class="highlight"><pre><span></span><span class="n">out_height</span> <span class="o">=</span> <span class="n">floor</span><span class="p">((</span><span class="n">height</span><span class="o">+</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">-</span><span class="n">dilation</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">*</span><span class="p">(</span><span class="n">kernel_size</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">/</span><span class="n">stride</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">+</span><span class="mi">1</span>
<span class="n">out_width</span> <span class="o">=</span> <span class="n">floor</span><span class="p">((</span><span class="n">width</span><span class="o">+</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">-</span><span class="n">dilation</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">*</span><span class="p">(</span><span class="n">kernel_size</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">/</span><span class="n">stride</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span><span class="o">+</span><span class="mi">1</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.Conv3D">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">Conv3D</code><span class="sig-paren">(</span><em>channels</em>, <em>kernel_size</em>, <em>strides=(1</em>, <em>1</em>, <em>1)</em>, <em>padding=(0</em>, <em>0</em>, <em>0)</em>, <em>dilation=(1</em>, <em>1</em>, <em>1)</em>, <em>groups=1</em>, <em>layout='NCDHW'</em>, <em>activation=None</em>, <em>use_bias=True</em>, <em>weight_initializer=None</em>, <em>bias_initializer='zeros'</em>, <em>in_channels=0</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.Conv3D" title="Permalink to this definition"></a></dt>
<dd><p>3D convolution layer (e.g. spatial convolution over volumes).</p>
<p>This layer creates a convolution kernel that is convolved
with the layer input to produce a tensor of
outputs. If <cite>use_bias</cite> is <cite>True</cite>,
a bias vector is created and added to the outputs. Finally, if
<cite>activation</cite> is not <cite>None</cite>, it is applied to the outputs as well.</p>
<p>If <cite>in_channels</cite> is not specified, <cite>Parameter</cite> initialization will be
deferred to the first time <cite>forward</cite> is called and <cite>in_channels</cite> will be
inferred from the shape of input data.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>channels</strong> (<em>int</em>) – The dimensionality of the output space, i.e. the number of output
channels (filters) in the convolution.</li>
<li><strong>kernel_size</strong> (<em>int</em><em> or </em><em>tuple/list of 3 int</em>) – Specifies the dimensions of the convolution window.</li>
<li><strong>strides</strong> (<em>int</em><em> or </em><em>tuple/list of 3 int</em><em>,</em><em></em>) – Specify the strides of the convolution.</li>
<li><strong>padding</strong> (<em>int</em><em> or </em><em>a tuple/list of 3 int</em><em>,</em><em></em>) – If padding is non-zero, then the input is implicitly zero-padded
on both sides for padding number of points</li>
<li><strong>dilation</strong> (<em>int</em><em> or </em><em>tuple/list of 3 int</em>) – Specifies the dilation rate to use for dilated convolution.</li>
<li><strong>groups</strong> (<em>int</em>) – Controls the connections between inputs and outputs.
At groups=1, all inputs are convolved to all outputs.
At groups=2, the operation becomes equivalent to having two conv
layers side by side, each seeing half the input channels, and producing
half the output channels, and both subsequently concatenated.</li>
<li><strong>layout</strong> (<em>str</em><em>, </em><em>default 'NCDHW'</em>) – Dimension ordering of data and weight. Can be ‘NCDHW’, ‘NDHWC’, etc.
‘N’, ‘C’, ‘H’, ‘W’, ‘D’ stands for batch, channel, height, width and
depth dimensions respectively. Convolution is applied on the ‘D’,
‘H’ and ‘W’ dimensions.</li>
<li><strong>in_channels</strong> (<em>int</em><em>, </em><em>default 0</em>) – The number of input channels to this layer. If not specified,
initialization will be deferred to the first time <cite>forward</cite> is called
and <cite>in_channels</cite> will be inferred from the shape of input data.</li>
<li><strong>activation</strong> (<em>str</em>) – Activation function to use. See <a class="reference internal" href="ndarray.html#mxnet.ndarray.Activation" title="mxnet.ndarray.Activation"><code class="xref py py-func docutils literal"><span class="pre">Activation()</span></code></a>.
If you don’t specify anything, no activation is applied
(ie. “linear” activation: <cite>a(x) = x</cite>).</li>
<li><strong>use_bias</strong> (<em>bool</em>) – Whether the layer uses a bias vector.</li>
<li><strong>weight_initializer</strong> (str or <cite>Initializer</cite>) – Initializer for the <cite>weight</cite> weights matrix.</li>
<li><strong>bias_initializer</strong> (str or <cite>Initializer</cite>) – Initializer for the bias vector.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shape:</dt>
<dd>This depends on the <cite>layout</cite> parameter. Input is 5D array of shape
(batch_size, in_channels, depth, height, width) if <cite>layout</cite> is <cite>NCDHW</cite>.</dd>
<dt>Output shape:</dt>
<dd><p class="first">This depends on the <cite>layout</cite> parameter. Output is 5D array of shape
(batch_size, channels, out_depth, out_height, out_width) if <cite>layout</cite> is
<cite>NCDHW</cite>.</p>
<p>out_depth, out_height and out_width are calculated as:</p>
<div class="last highlight-default"><div class="highlight"><pre><span></span><span class="n">out_depth</span> <span class="o">=</span> <span class="n">floor</span><span class="p">((</span><span class="n">depth</span><span class="o">+</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">-</span><span class="n">dilation</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">*</span><span class="p">(</span><span class="n">kernel_size</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">/</span><span class="n">stride</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">+</span><span class="mi">1</span>
<span class="n">out_height</span> <span class="o">=</span> <span class="n">floor</span><span class="p">((</span><span class="n">height</span><span class="o">+</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">-</span><span class="n">dilation</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">*</span><span class="p">(</span><span class="n">kernel_size</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">/</span><span class="n">stride</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span><span class="o">+</span><span class="mi">1</span>
<span class="n">out_width</span> <span class="o">=</span> <span class="n">floor</span><span class="p">((</span><span class="n">width</span><span class="o">+</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">-</span><span class="n">dilation</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">*</span><span class="p">(</span><span class="n">kernel_size</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">/</span><span class="n">stride</span><span class="p">[</span><span class="mi">2</span><span class="p">])</span><span class="o">+</span><span class="mi">1</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.Conv1DTranspose">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">Conv1DTranspose</code><span class="sig-paren">(</span><em>channels</em>, <em>kernel_size</em>, <em>strides=1</em>, <em>padding=0</em>, <em>output_padding=0</em>, <em>dilation=1</em>, <em>groups=1</em>, <em>layout='NCW'</em>, <em>activation=None</em>, <em>use_bias=True</em>, <em>weight_initializer=None</em>, <em>bias_initializer='zeros'</em>, <em>in_channels=0</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.Conv1DTranspose" title="Permalink to this definition"></a></dt>
<dd><p>Transposed 1D convolution layer (sometimes called Deconvolution).</p>
<p>The need for transposed convolutions generally arises
from the desire to use a transformation going in the opposite direction
of a normal convolution, i.e., from something that has the shape of the
output of some convolution to something that has the shape of its input
while maintaining a connectivity pattern that is compatible with
said convolution.</p>
<p>If <cite>in_channels</cite> is not specified, <cite>Parameter</cite> initialization will be
deferred to the first time <cite>forward</cite> is called and <cite>in_channels</cite> will be
inferred from the shape of input data.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>channels</strong> (<em>int</em>) – The dimensionality of the output space, i.e. the number of output
channels (filters) in the convolution.</li>
<li><strong>kernel_size</strong> (<em>int</em><em> or </em><em>tuple/list of 1 int</em>) – Specifies the dimensions of the convolution window.</li>
<li><strong>strides</strong> (<em>int</em><em> or </em><em>tuple/list of 1 int</em><em>,</em><em></em>) – Specify the strides of the convolution.</li>
<li><strong>padding</strong> (<em>int</em><em> or </em><em>a tuple/list of 1 int</em><em>,</em><em></em>) – If padding is non-zero, then the input is implicitly zero-padded
on both sides for padding number of points</li>
<li><strong>dilation</strong> (<em>int</em><em> or </em><em>tuple/list of 1 int</em>) – Specifies the dilation rate to use for dilated convolution.</li>
<li><strong>groups</strong> (<em>int</em>) – Controls the connections between inputs and outputs.
At groups=1, all inputs are convolved to all outputs.
At groups=2, the operation becomes equivalent to having two conv
layers side by side, each seeing half the input channels, and producing
half the output channels, and both subsequently concatenated.</li>
<li><strong>layout</strong> (<em>str</em><em>, </em><em>default 'NCW'</em>) – Dimension ordering of data and weight. Can be ‘NCW’, ‘NWC’, etc.
‘N’, ‘C’, ‘W’ stands for batch, channel, and width (time) dimensions
respectively. Convolution is applied on the ‘W’ dimension.</li>
<li><strong>in_channels</strong> (<em>int</em><em>, </em><em>default 0</em>) – The number of input channels to this layer. If not specified,
initialization will be deferred to the first time <cite>forward</cite> is called
and <cite>in_channels</cite> will be inferred from the shape of input data.</li>
<li><strong>activation</strong> (<em>str</em>) – Activation function to use. See <a class="reference internal" href="ndarray.html#mxnet.ndarray.Activation" title="mxnet.ndarray.Activation"><code class="xref py py-func docutils literal"><span class="pre">Activation()</span></code></a>.
If you don’t specify anything, no activation is applied
(ie. “linear” activation: <cite>a(x) = x</cite>).</li>
<li><strong>use_bias</strong> (<em>bool</em>) – Whether the layer uses a bias vector.</li>
<li><strong>weight_initializer</strong> (str or <cite>Initializer</cite>) – Initializer for the <cite>weight</cite> weights matrix.</li>
<li><strong>bias_initializer</strong> (str or <cite>Initializer</cite>) – Initializer for the bias vector.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shape:</dt>
<dd>This depends on the <cite>layout</cite> parameter. Input is 3D array of shape
(batch_size, in_channels, width) if <cite>layout</cite> is <cite>NCW</cite>.</dd>
<dt>Output shape:</dt>
<dd><p class="first">This depends on the <cite>layout</cite> parameter. Output is 3D array of shape
(batch_size, channels, out_width) if <cite>layout</cite> is <cite>NCW</cite>.</p>
<p>out_width is calculated as:</p>
<div class="last highlight-default"><div class="highlight"><pre><span></span><span class="n">out_width</span> <span class="o">=</span> <span class="p">(</span><span class="n">width</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">strides</span><span class="o">-</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="o">+</span><span class="n">kernel_size</span><span class="o">+</span><span class="n">output_padding</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.Conv2DTranspose">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">Conv2DTranspose</code><span class="sig-paren">(</span><em>channels</em>, <em>kernel_size</em>, <em>strides=(1</em>, <em>1)</em>, <em>padding=(0</em>, <em>0)</em>, <em>output_padding=(0</em>, <em>0)</em>, <em>dilation=(1</em>, <em>1)</em>, <em>groups=1</em>, <em>layout='NCHW'</em>, <em>activation=None</em>, <em>use_bias=True</em>, <em>weight_initializer=None</em>, <em>bias_initializer='zeros'</em>, <em>in_channels=0</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.Conv2DTranspose" title="Permalink to this definition"></a></dt>
<dd><p>Transposed 2D convolution layer (sometimes called Deconvolution).</p>
<p>The need for transposed convolutions generally arises
from the desire to use a transformation going in the opposite direction
of a normal convolution, i.e., from something that has the shape of the
output of some convolution to something that has the shape of its input
while maintaining a connectivity pattern that is compatible with
said convolution.</p>
<p>If <cite>in_channels</cite> is not specified, <cite>Parameter</cite> initialization will be
deferred to the first time <cite>forward</cite> is called and <cite>in_channels</cite> will be
inferred from the shape of input data.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>channels</strong> (<em>int</em>) – The dimensionality of the output space, i.e. the number of output
channels (filters) in the convolution.</li>
<li><strong>kernel_size</strong> (<em>int</em><em> or </em><em>tuple/list of 2 int</em>) – Specifies the dimensions of the convolution window.</li>
<li><strong>strides</strong> (<em>int</em><em> or </em><em>tuple/list of 2 int</em><em>,</em><em></em>) – Specify the strides of the convolution.</li>
<li><strong>padding</strong> (<em>int</em><em> or </em><em>a tuple/list of 2 int</em><em>,</em><em></em>) – If padding is non-zero, then the input is implicitly zero-padded
on both sides for padding number of points</li>
<li><strong>dilation</strong> (<em>int</em><em> or </em><em>tuple/list of 2 int</em>) – Specifies the dilation rate to use for dilated convolution.</li>
<li><strong>groups</strong> (<em>int</em>) – Controls the connections between inputs and outputs.
At groups=1, all inputs are convolved to all outputs.
At groups=2, the operation becomes equivalent to having two conv
layers side by side, each seeing half the input channels, and producing
half the output channels, and both subsequently concatenated.</li>
<li><strong>layout</strong> (<em>str</em><em>, </em><em>default 'NCHW'</em>) – Dimension ordering of data and weight. Can be ‘NCHW’, ‘NHWC’, etc.
‘N’, ‘C’, ‘H’, ‘W’ stands for batch, channel, height, and width
dimensions respectively. Convolution is applied on the ‘H’ and
‘W’ dimensions.</li>
<li><strong>in_channels</strong> (<em>int</em><em>, </em><em>default 0</em>) – The number of input channels to this layer. If not specified,
initialization will be deferred to the first time <cite>forward</cite> is called
and <cite>in_channels</cite> will be inferred from the shape of input data.</li>
<li><strong>activation</strong> (<em>str</em>) – Activation function to use. See <a class="reference internal" href="ndarray.html#mxnet.ndarray.Activation" title="mxnet.ndarray.Activation"><code class="xref py py-func docutils literal"><span class="pre">Activation()</span></code></a>.
If you don’t specify anything, no activation is applied
(ie. “linear” activation: <cite>a(x) = x</cite>).</li>
<li><strong>use_bias</strong> (<em>bool</em>) – Whether the layer uses a bias vector.</li>
<li><strong>weight_initializer</strong> (str or <cite>Initializer</cite>) – Initializer for the <cite>weight</cite> weights matrix.</li>
<li><strong>bias_initializer</strong> (str or <cite>Initializer</cite>) – Initializer for the bias vector.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shape:</dt>
<dd>This depends on the <cite>layout</cite> parameter. Input is 4D array of shape
(batch_size, in_channels, height, width) if <cite>layout</cite> is <cite>NCHW</cite>.</dd>
<dt>Output shape:</dt>
<dd><p class="first">This depends on the <cite>layout</cite> parameter. Output is 4D array of shape
(batch_size, channels, out_height, out_width) if <cite>layout</cite> is <cite>NCHW</cite>.</p>
<p>out_height and out_width are calculated as:</p>
<div class="last highlight-default"><div class="highlight"><pre><span></span><span class="n">out_height</span> <span class="o">=</span> <span class="p">(</span><span class="n">height</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">strides</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">-</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">+</span><span class="n">kernel_size</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">+</span><span class="n">output_padding</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">out_width</span> <span class="o">=</span> <span class="p">(</span><span class="n">width</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">strides</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">-</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">+</span><span class="n">kernel_size</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">+</span><span class="n">output_padding</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.Conv3DTranspose">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">Conv3DTranspose</code><span class="sig-paren">(</span><em>channels</em>, <em>kernel_size</em>, <em>strides=(1</em>, <em>1</em>, <em>1)</em>, <em>padding=(0</em>, <em>0</em>, <em>0)</em>, <em>output_padding=(0</em>, <em>0</em>, <em>0)</em>, <em>dilation=(1</em>, <em>1</em>, <em>1)</em>, <em>groups=1</em>, <em>layout='NCDHW'</em>, <em>activation=None</em>, <em>use_bias=True</em>, <em>weight_initializer=None</em>, <em>bias_initializer='zeros'</em>, <em>in_channels=0</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.Conv3DTranspose" title="Permalink to this definition"></a></dt>
<dd><p>Transposed 3D convolution layer (sometimes called Deconvolution).</p>
<p>The need for transposed convolutions generally arises
from the desire to use a transformation going in the opposite direction
of a normal convolution, i.e., from something that has the shape of the
output of some convolution to something that has the shape of its input
while maintaining a connectivity pattern that is compatible with
said convolution.</p>
<p>If <cite>in_channels</cite> is not specified, <cite>Parameter</cite> initialization will be
deferred to the first time <cite>forward</cite> is called and <cite>in_channels</cite> will be
inferred from the shape of input data.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>channels</strong> (<em>int</em>) – The dimensionality of the output space, i.e. the number of output
channels (filters) in the convolution.</li>
<li><strong>kernel_size</strong> (<em>int</em><em> or </em><em>tuple/list of 3 int</em>) – Specifies the dimensions of the convolution window.</li>
<li><strong>strides</strong> (<em>int</em><em> or </em><em>tuple/list of 3 int</em><em>,</em><em></em>) – Specify the strides of the convolution.</li>
<li><strong>padding</strong> (<em>int</em><em> or </em><em>a tuple/list of 3 int</em><em>,</em><em></em>) – If padding is non-zero, then the input is implicitly zero-padded
on both sides for padding number of points</li>
<li><strong>dilation</strong> (<em>int</em><em> or </em><em>tuple/list of 3 int</em>) – Specifies the dilation rate to use for dilated convolution.</li>
<li><strong>groups</strong> (<em>int</em>) – Controls the connections between inputs and outputs.
At groups=1, all inputs are convolved to all outputs.
At groups=2, the operation becomes equivalent to having two conv
layers side by side, each seeing half the input channels, and producing
half the output channels, and both subsequently concatenated.</li>
<li><strong>layout</strong> (<em>str</em><em>, </em><em>default 'NCDHW'</em>) – Dimension ordering of data and weight. Can be ‘NCDHW’, ‘NDHWC’, etc.
‘N’, ‘C’, ‘H’, ‘W’, ‘D’ stands for batch, channel, height, width and
depth dimensions respectively. Convolution is applied on the ‘D’,
‘H’, and ‘W’ dimensions.</li>
<li><strong>in_channels</strong> (<em>int</em><em>, </em><em>default 0</em>) – The number of input channels to this layer. If not specified,
initialization will be deferred to the first time <cite>forward</cite> is called
and <cite>in_channels</cite> will be inferred from the shape of input data.</li>
<li><strong>activation</strong> (<em>str</em>) – Activation function to use. See <a class="reference internal" href="ndarray.html#mxnet.ndarray.Activation" title="mxnet.ndarray.Activation"><code class="xref py py-func docutils literal"><span class="pre">Activation()</span></code></a>.
If you don’t specify anything, no activation is applied
(ie. “linear” activation: <cite>a(x) = x</cite>).</li>
<li><strong>use_bias</strong> (<em>bool</em>) – Whether the layer uses a bias vector.</li>
<li><strong>weight_initializer</strong> (str or <cite>Initializer</cite>) – Initializer for the <cite>weight</cite> weights matrix.</li>
<li><strong>bias_initializer</strong> (str or <cite>Initializer</cite>) – Initializer for the bias vector.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shape:</dt>
<dd>This depends on the <cite>layout</cite> parameter. Input is 5D array of shape
(batch_size, in_channels, depth, height, width) if <cite>layout</cite> is <cite>NCDHW</cite>.</dd>
<dt>Output shape:</dt>
<dd><p class="first">This depends on the <cite>layout</cite> parameter. Output is 5D array of shape
(batch_size, channels, out_depth, out_height, out_width) if <cite>layout</cite> is <cite>NCDHW</cite>.
out_depth, out_height and out_width are calculated as:</p>
<div class="last highlight-default"><div class="highlight"><pre><span></span><span class="n">out_depth</span> <span class="o">=</span> <span class="p">(</span><span class="n">depth</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">strides</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">-</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">+</span><span class="n">kernel_size</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">+</span><span class="n">output_padding</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">out_height</span> <span class="o">=</span> <span class="p">(</span><span class="n">height</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">strides</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">-</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">+</span><span class="n">kernel_size</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">+</span><span class="n">output_padding</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
<span class="n">out_width</span> <span class="o">=</span> <span class="p">(</span><span class="n">width</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">*</span><span class="n">strides</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">-</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">+</span><span class="n">kernel_size</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">+</span><span class="n">output_padding</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>
</div>
<div class="section" id="pooling-layers">
<span id="pooling-layers"></span><h3>Pooling Layers<a class="headerlink" href="#pooling-layers" title="Permalink to this headline"></a></h3>
<dl class="class">
<dt id="mxnet.gluon.nn.MaxPool1D">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">MaxPool1D</code><span class="sig-paren">(</span><em>pool_size=2</em>, <em>strides=None</em>, <em>padding=0</em>, <em>layout='NCW'</em>, <em>ceil_mode=False</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.MaxPool1D" title="Permalink to this definition"></a></dt>
<dd><p>Max pooling operation for one dimensional data.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>pool_size</strong> (<em>int</em>) – Size of the max pooling windows.</li>
<li><strong>strides</strong> (<em>int</em><em>, or </em><em>None</em>) – Factor by which to downscale. E.g. 2 will halve the input size.
If <cite>None</cite>, it will default to <cite>pool_size</cite>.</li>
<li><strong>padding</strong> (<em>int</em>) – If padding is non-zero, then the input is implicitly
zero-padded on both sides for padding number of points.</li>
<li><strong>layout</strong> (<em>str</em><em>, </em><em>default 'NCW'</em>) – Dimension ordering of data and weight. Can be ‘NCW’, ‘NWC’, etc.
‘N’, ‘C’, ‘W’ stands for batch, channel, and width (time) dimensions
respectively. Pooling is applied on the W dimension.</li>
<li><strong>ceil_mode</strong> (<em>bool</em><em>, </em><em>default False</em>) – When <cite>True</cite>, will use ceil instead of floor to compute the output shape.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shape:</dt>
<dd>This depends on the <cite>layout</cite> parameter. Input is 3D array of shape
(batch_size, channels, width) if <cite>layout</cite> is <cite>NCW</cite>.</dd>
<dt>Output shape:</dt>
<dd><p class="first">This depends on the <cite>layout</cite> parameter. Output is 3D array of shape
(batch_size, channels, out_width) if <cite>layout</cite> is <cite>NCW</cite>.</p>
<p>out_width is calculated as:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">out_width</span> <span class="o">=</span> <span class="n">floor</span><span class="p">((</span><span class="n">width</span><span class="o">+</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="o">-</span><span class="n">pool_size</span><span class="p">)</span><span class="o">/</span><span class="n">strides</span><span class="p">)</span><span class="o">+</span><span class="mi">1</span>
</pre></div>
</div>
<p class="last">When <cite>ceil_mode</cite> is <cite>True</cite>, ceil will be used instead of floor in this
equation.</p>
</dd>
</dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.MaxPool2D">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">MaxPool2D</code><span class="sig-paren">(</span><em>pool_size=(2</em>, <em>2)</em>, <em>strides=None</em>, <em>padding=0</em>, <em>layout='NCHW'</em>, <em>ceil_mode=False</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.MaxPool2D" title="Permalink to this definition"></a></dt>
<dd><p>Max pooling operation for two dimensional (spatial) data.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>pool_size</strong> (<em>int</em><em> or </em><em>list/tuple of 2 ints</em><em>,</em><em></em>) – Size of the max pooling windows.</li>
<li><strong>strides</strong> (<em>int</em><em>, </em><em>list/tuple of 2 ints</em><em>, or </em><em>None.</em>) – Factor by which to downscale. E.g. 2 will halve the input size.
If <cite>None</cite>, it will default to <cite>pool_size</cite>.</li>
<li><strong>padding</strong> (<em>int</em><em> or </em><em>list/tuple of 2 ints</em><em>,</em><em></em>) – If padding is non-zero, then the input is implicitly
zero-padded on both sides for padding number of points.</li>
<li><strong>layout</strong> (<em>str</em><em>, </em><em>default 'NCHW'</em>) – Dimension ordering of data and weight. Can be ‘NCHW’, ‘NHWC’, etc.
‘N’, ‘C’, ‘H’, ‘W’ stands for batch, channel, height, and width
dimensions respectively. padding is applied on ‘H’ and ‘W’ dimension.</li>
<li><strong>ceil_mode</strong> (<em>bool</em><em>, </em><em>default False</em>) – When <cite>True</cite>, will use ceil instead of floor to compute the output shape.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shape:</dt>
<dd>This depends on the <cite>layout</cite> parameter. Input is 4D array of shape
(batch_size, channels, height, width) if <cite>layout</cite> is <cite>NCHW</cite>.</dd>
<dt>Output shape:</dt>
<dd><p class="first">This depends on the <cite>layout</cite> parameter. Output is 4D array of shape
(batch_size, channels, out_height, out_width) if <cite>layout</cite> is <cite>NCHW</cite>.</p>
<p>out_height and out_width are calculated as:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">out_height</span> <span class="o">=</span> <span class="n">floor</span><span class="p">((</span><span class="n">height</span><span class="o">+</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">-</span><span class="n">pool_size</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">/</span><span class="n">strides</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">+</span><span class="mi">1</span>
<span class="n">out_width</span> <span class="o">=</span> <span class="n">floor</span><span class="p">((</span><span class="n">width</span><span class="o">+</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">-</span><span class="n">pool_size</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span><span class="o">/</span><span class="n">strides</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span><span class="o">+</span><span class="mi">1</span>
</pre></div>
</div>
<p class="last">When <cite>ceil_mode</cite> is <cite>True</cite>, ceil will be used instead of floor in this
equation.</p>
</dd>
</dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.MaxPool3D">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">MaxPool3D</code><span class="sig-paren">(</span><em>pool_size=(2</em>, <em>2</em>, <em>2)</em>, <em>strides=None</em>, <em>padding=0</em>, <em>ceil_mode=False</em>, <em>layout='NCDHW'</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.MaxPool3D" title="Permalink to this definition"></a></dt>
<dd><p>Max pooling operation for 3D data (spatial or spatio-temporal).</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>pool_size</strong> (<em>int</em><em> or </em><em>list/tuple of 3 ints</em><em>,</em><em></em>) – Size of the max pooling windows.</li>
<li><strong>strides</strong> (<em>int</em><em>, </em><em>list/tuple of 3 ints</em><em>, or </em><em>None.</em>) – Factor by which to downscale. E.g. 2 will halve the input size.
If <cite>None</cite>, it will default to <cite>pool_size</cite>.</li>
<li><strong>padding</strong> (<em>int</em><em> or </em><em>list/tuple of 3 ints</em><em>,</em><em></em>) – If padding is non-zero, then the input is implicitly
zero-padded on both sides for padding number of points.</li>
<li><strong>layout</strong> (<em>str</em><em>, </em><em>default 'NCDHW'</em>) – Dimension ordering of data and weight. Can be ‘NCDHW’, ‘NDHWC’, etc.
‘N’, ‘C’, ‘H’, ‘W’, ‘D’ stands for batch, channel, height, width and
depth dimensions respectively. padding is applied on ‘D’, ‘H’ and ‘W’
dimension.</li>
<li><strong>ceil_mode</strong> (<em>bool</em><em>, </em><em>default False</em>) – When <cite>True</cite>, will use ceil instead of floor to compute the output shape.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shape:</dt>
<dd>This depends on the <cite>layout</cite> parameter. Input is 5D array of shape
(batch_size, channels, depth, height, width) if <cite>layout</cite> is <cite>NCDHW</cite>.</dd>
<dt>Output shape:</dt>
<dd><p class="first">This depends on the <cite>layout</cite> parameter. Output is 5D array of shape
(batch_size, channels, out_depth, out_height, out_width) if <cite>layout</cite>
is <cite>NCDHW</cite>.</p>
<p>out_depth, out_height and out_width are calculated as</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">out_depth</span> <span class="o">=</span> <span class="n">floor</span><span class="p">((</span><span class="n">depth</span><span class="o">+</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">-</span><span class="n">pool_size</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">/</span><span class="n">strides</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">+</span><span class="mi">1</span>
<span class="n">out_height</span> <span class="o">=</span> <span class="n">floor</span><span class="p">((</span><span class="n">height</span><span class="o">+</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">-</span><span class="n">pool_size</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span><span class="o">/</span><span class="n">strides</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span><span class="o">+</span><span class="mi">1</span>
<span class="n">out_width</span> <span class="o">=</span> <span class="n">floor</span><span class="p">((</span><span class="n">width</span><span class="o">+</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">-</span><span class="n">pool_size</span><span class="p">[</span><span class="mi">2</span><span class="p">])</span><span class="o">/</span><span class="n">strides</span><span class="p">[</span><span class="mi">2</span><span class="p">])</span><span class="o">+</span><span class="mi">1</span>
</pre></div>
</div>
<p class="last">When <cite>ceil_mode</cite> is <cite>True</cite>, ceil will be used instead of floor in this
equation.</p>
</dd>
</dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.AvgPool1D">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">AvgPool1D</code><span class="sig-paren">(</span><em>pool_size=2</em>, <em>strides=None</em>, <em>padding=0</em>, <em>layout='NCW'</em>, <em>ceil_mode=False</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.AvgPool1D" title="Permalink to this definition"></a></dt>
<dd><p>Average pooling operation for temporal data.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>pool_size</strong> (<em>int</em>) – Size of the average pooling windows.</li>
<li><strong>strides</strong> (<em>int</em><em>, or </em><em>None</em>) – Factor by which to downscale. E.g. 2 will halve the input size.
If <cite>None</cite>, it will default to <cite>pool_size</cite>.</li>
<li><strong>padding</strong> (<em>int</em>) – If padding is non-zero, then the input is implicitly
zero-padded on both sides for padding number of points.</li>
<li><strong>layout</strong> (<em>str</em><em>, </em><em>default 'NCW'</em>) – Dimension ordering of data and weight. Can be ‘NCW’, ‘NWC’, etc.
‘N’, ‘C’, ‘W’ stands for batch, channel, and width (time) dimensions
respectively. padding is applied on ‘W’ dimension.</li>
<li><strong>ceil_mode</strong> (<em>bool</em><em>, </em><em>default False</em>) – When <cite>True</cite>, will use ceil instead of floor to compute the output shape.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shape:</dt>
<dd>This depends on the <cite>layout</cite> parameter. Input is 3D array of shape
(batch_size, channels, width) if <cite>layout</cite> is <cite>NCW</cite>.</dd>
<dt>Output shape:</dt>
<dd><p class="first">This depends on the <cite>layout</cite> parameter. Output is 3D array of shape
(batch_size, channels, out_width) if <cite>layout</cite> is <cite>NCW</cite>.</p>
<p>out_width is calculated as:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">out_width</span> <span class="o">=</span> <span class="n">floor</span><span class="p">((</span><span class="n">width</span><span class="o">+</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="o">-</span><span class="n">pool_size</span><span class="p">)</span><span class="o">/</span><span class="n">strides</span><span class="p">)</span><span class="o">+</span><span class="mi">1</span>
</pre></div>
</div>
<p class="last">When <cite>ceil_mode</cite> is <cite>True</cite>, ceil will be used instead of floor in this
equation.</p>
</dd>
</dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.AvgPool2D">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">AvgPool2D</code><span class="sig-paren">(</span><em>pool_size=(2</em>, <em>2)</em>, <em>strides=None</em>, <em>padding=0</em>, <em>ceil_mode=False</em>, <em>layout='NCHW'</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.AvgPool2D" title="Permalink to this definition"></a></dt>
<dd><p>Average pooling operation for spatial data.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>pool_size</strong> (<em>int</em><em> or </em><em>list/tuple of 2 ints</em><em>,</em><em></em>) – Size of the average pooling windows.</li>
<li><strong>strides</strong> (<em>int</em><em>, </em><em>list/tuple of 2 ints</em><em>, or </em><em>None.</em>) – Factor by which to downscale. E.g. 2 will halve the input size.
If <cite>None</cite>, it will default to <cite>pool_size</cite>.</li>
<li><strong>padding</strong> (<em>int</em><em> or </em><em>list/tuple of 2 ints</em><em>,</em><em></em>) – If padding is non-zero, then the input is implicitly
zero-padded on both sides for padding number of points.</li>
<li><strong>layout</strong> (<em>str</em><em>, </em><em>default 'NCHW'</em>) – Dimension ordering of data and weight. Can be ‘NCHW’, ‘NHWC’, etc.
‘N’, ‘C’, ‘H’, ‘W’ stands for batch, channel, height, and width
dimensions respectively. padding is applied on ‘H’ and ‘W’ dimension.</li>
<li><strong>ceil_mode</strong> (<em>bool</em><em>, </em><em>default False</em>) – When True, will use ceil instead of floor to compute the output shape.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shape:</dt>
<dd>This depends on the <cite>layout</cite> parameter. Input is 4D array of shape
(batch_size, channels, height, width) if <cite>layout</cite> is <cite>NCHW</cite>.</dd>
<dt>Output shape:</dt>
<dd><p class="first">This depends on the <cite>layout</cite> parameter. Output is 4D array of shape
(batch_size, channels, out_height, out_width) if <cite>layout</cite> is <cite>NCHW</cite>.</p>
<p>out_height and out_width are calculated as:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">out_height</span> <span class="o">=</span> <span class="n">floor</span><span class="p">((</span><span class="n">height</span><span class="o">+</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">-</span><span class="n">pool_size</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">/</span><span class="n">strides</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">+</span><span class="mi">1</span>
<span class="n">out_width</span> <span class="o">=</span> <span class="n">floor</span><span class="p">((</span><span class="n">width</span><span class="o">+</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">-</span><span class="n">pool_size</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span><span class="o">/</span><span class="n">strides</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span><span class="o">+</span><span class="mi">1</span>
</pre></div>
</div>
<p class="last">When <cite>ceil_mode</cite> is <cite>True</cite>, ceil will be used instead of floor in this
equation.</p>
</dd>
</dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.AvgPool3D">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">AvgPool3D</code><span class="sig-paren">(</span><em>pool_size=(2</em>, <em>2</em>, <em>2)</em>, <em>strides=None</em>, <em>padding=0</em>, <em>ceil_mode=False</em>, <em>layout='NCDHW'</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.AvgPool3D" title="Permalink to this definition"></a></dt>
<dd><p>Average pooling operation for 3D data (spatial or spatio-temporal).</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>pool_size</strong> (<em>int</em><em> or </em><em>list/tuple of 3 ints</em><em>,</em><em></em>) – Size of the average pooling windows.</li>
<li><strong>strides</strong> (<em>int</em><em>, </em><em>list/tuple of 3 ints</em><em>, or </em><em>None.</em>) – Factor by which to downscale. E.g. 2 will halve the input size.
If <cite>None</cite>, it will default to <cite>pool_size</cite>.</li>
<li><strong>padding</strong> (<em>int</em><em> or </em><em>list/tuple of 3 ints</em><em>,</em><em></em>) – If padding is non-zero, then the input is implicitly
zero-padded on both sides for padding number of points.</li>
<li><strong>layout</strong> (<em>str</em><em>, </em><em>default 'NCDHW'</em>) – Dimension ordering of data and weight. Can be ‘NCDHW’, ‘NDHWC’, etc.
‘N’, ‘C’, ‘H’, ‘W’, ‘D’ stands for batch, channel, height, width and
depth dimensions respectively. padding is applied on ‘D’, ‘H’ and ‘W’
dimension.</li>
<li><strong>ceil_mode</strong> (<em>bool</em><em>, </em><em>default False</em>) – When True, will use ceil instead of floor to compute the output shape.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shape:</dt>
<dd>This depends on the <cite>layout</cite> parameter. Input is 5D array of shape
(batch_size, channels, depth, height, width) if <cite>layout</cite> is <cite>NCDHW</cite>.</dd>
<dt>Output shape:</dt>
<dd><p class="first">This depends on the <cite>layout</cite> parameter. Output is 5D array of shape
(batch_size, channels, out_depth, out_height, out_width) if <cite>layout</cite>
is <cite>NCDHW</cite>.</p>
<p>out_depth, out_height and out_width are calculated as</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">out_depth</span> <span class="o">=</span> <span class="n">floor</span><span class="p">((</span><span class="n">depth</span><span class="o">+</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">-</span><span class="n">pool_size</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">/</span><span class="n">strides</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">+</span><span class="mi">1</span>
<span class="n">out_height</span> <span class="o">=</span> <span class="n">floor</span><span class="p">((</span><span class="n">height</span><span class="o">+</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">-</span><span class="n">pool_size</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span><span class="o">/</span><span class="n">strides</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span><span class="o">+</span><span class="mi">1</span>
<span class="n">out_width</span> <span class="o">=</span> <span class="n">floor</span><span class="p">((</span><span class="n">width</span><span class="o">+</span><span class="mi">2</span><span class="o">*</span><span class="n">padding</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span><span class="o">-</span><span class="n">pool_size</span><span class="p">[</span><span class="mi">2</span><span class="p">])</span><span class="o">/</span><span class="n">strides</span><span class="p">[</span><span class="mi">2</span><span class="p">])</span><span class="o">+</span><span class="mi">1</span>
</pre></div>
</div>
<p class="last">When <cite>ceil_mode</cite> is <cite>True</cite>, ceil will be used instead of floor in this
equation.</p>
</dd>
</dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.GlobalMaxPool1D">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">GlobalMaxPool1D</code><span class="sig-paren">(</span><em>layout='NCW'</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.GlobalMaxPool1D" title="Permalink to this definition"></a></dt>
<dd><p>Global max pooling operation for temporal data.</p>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.GlobalMaxPool2D">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">GlobalMaxPool2D</code><span class="sig-paren">(</span><em>layout='NCHW'</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.GlobalMaxPool2D" title="Permalink to this definition"></a></dt>
<dd><p>Global max pooling operation for spatial data.</p>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.GlobalMaxPool3D">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">GlobalMaxPool3D</code><span class="sig-paren">(</span><em>layout='NCDHW'</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.GlobalMaxPool3D" title="Permalink to this definition"></a></dt>
<dd><p>Global max pooling operation for 3D data.</p>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.GlobalAvgPool1D">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">GlobalAvgPool1D</code><span class="sig-paren">(</span><em>layout='NCW'</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.GlobalAvgPool1D" title="Permalink to this definition"></a></dt>
<dd><p>Global average pooling operation for temporal data.</p>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.GlobalAvgPool2D">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">GlobalAvgPool2D</code><span class="sig-paren">(</span><em>layout='NCHW'</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.GlobalAvgPool2D" title="Permalink to this definition"></a></dt>
<dd><p>Global average pooling operation for spatial data.</p>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.nn.GlobalAvgPool3D">
<em class="property">class </em><code class="descclassname">mxnet.gluon.nn.</code><code class="descname">GlobalAvgPool3D</code><span class="sig-paren">(</span><em>layout='NCDHW'</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.nn.GlobalAvgPool3D" title="Permalink to this definition"></a></dt>
<dd><p>Global average pooling operation for 3D data.</p>
</dd></dl>
</div>
</div>
<div class="section" id="recurrent-layers">
<span id="recurrent-layers"></span><h2>Recurrent Layers<a class="headerlink" href="#recurrent-layers" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="mxnet.gluon.rnn.RecurrentCell">
<em class="property">class </em><code class="descclassname">mxnet.gluon.rnn.</code><code class="descname">RecurrentCell</code><span class="sig-paren">(</span><em>prefix=None</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.rnn.RecurrentCell" title="Permalink to this definition"></a></dt>
<dd><p>Abstract base class for RNN cells</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>prefix</strong> (<em>str</em><em>, </em><em>optional</em>) – Prefix for names of <cite>Block</cite>s
(this prefix is also used for names of weights if <cite>params</cite> is <cite>None</cite>
i.e. if <cite>params</cite> are being created and not reused)</li>
<li><strong>params</strong> (<a class="reference internal" href="#mxnet.gluon.Parameter" title="mxnet.gluon.Parameter"><em>Parameter</em></a><em> or </em><em>None</em><em>, </em><em>optional</em>) – Container for weight sharing between cells.
A new Parameter container is created if <cite>params</cite> is <cite>None</cite>.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="method">
<dt id="mxnet.gluon.rnn.RecurrentCell.__call__">
<code class="descname">__call__</code><span class="sig-paren">(</span><em>*args</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.rnn.RecurrentCell.__call__" title="Permalink to this definition"></a></dt>
<dd><p>Calls forward. Only accepts positional arguments.</p>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.rnn.RecurrentCell.reset">
<code class="descname">reset</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.rnn.RecurrentCell.reset" title="Permalink to this definition"></a></dt>
<dd><p>Reset before re-using the cell for another graph.</p>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.rnn.RecurrentCell.state_info">
<code class="descname">state_info</code><span class="sig-paren">(</span><em>batch_size=0</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.rnn.RecurrentCell.state_info" title="Permalink to this definition"></a></dt>
<dd><p>shape and layout information of states</p>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.rnn.RecurrentCell.begin_state">
<code class="descname">begin_state</code><span class="sig-paren">(</span><em>batch_size=0</em>, <em>func=&lt;function zeros&gt;</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.rnn.RecurrentCell.begin_state" title="Permalink to this definition"></a></dt>
<dd><p>Initial state for this cell.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>func</strong> (<em>callable</em><em>, </em><em>default symbol.zeros</em>) – <p>Function for creating initial state.</p>
<p>For Symbol API, func can be <cite>symbol.zeros</cite>, <cite>symbol.uniform</cite>,
<cite>symbol.var etc</cite>. Use <cite>symbol.var</cite> if you want to directly
feed input as states.</p>
<p>For NDArray API, func can be <cite>ndarray.zeros</cite>, <cite>ndarray.ones</cite>, etc.</p>
</li>
<li><strong>batch_size</strong> (<em>int</em><em>, </em><em>default 0</em>) – Only required for NDArray API. Size of the batch (‘N’ in layout)
dimension of input.</li>
<li><strong>**kwargs</strong> – Additional keyword arguments passed to func. For example
<cite>mean</cite>, <cite>std</cite>, <cite>dtype</cite>, etc.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first"><strong>states</strong> – Starting states for the first RNN step.</p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last">nested list of Symbol</p>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.rnn.RecurrentCell.unroll">
<code class="descname">unroll</code><span class="sig-paren">(</span><em>length</em>, <em>inputs</em>, <em>begin_state=None</em>, <em>layout='NTC'</em>, <em>merge_outputs=None</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.rnn.RecurrentCell.unroll" title="Permalink to this definition"></a></dt>
<dd><p>Unrolls an RNN cell across time steps.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>length</strong> (<em>int</em>) – Number of steps to unroll.</li>
<li><strong>inputs</strong> (<a class="reference internal" href="symbol.html#mxnet.symbol.Symbol" title="mxnet.symbol.Symbol"><em>Symbol</em></a><em>, </em><em>list of Symbol</em><em>, or </em><em>None</em>) – <p>If <cite>inputs</cite> is a single Symbol (usually the output
of Embedding symbol), it should have shape
(batch_size, length, ...) if <cite>layout</cite> is ‘NTC’,
or (length, batch_size, ...) if <cite>layout</cite> is ‘TNC’.</p>
<p>If <cite>inputs</cite> is a list of symbols (usually output of
previous unroll), they should all have shape
(batch_size, ...).</p>
</li>
<li><strong>begin_state</strong> (<em>nested list of Symbol</em><em>, </em><em>optional</em>) – Input states created by <cite>begin_state()</cite>
or output state of another cell.
Created from <cite>begin_state()</cite> if <cite>None</cite>.</li>
<li><strong>layout</strong> (<em>str</em><em>, </em><em>optional</em>) – <cite>layout</cite> of input symbol. Only used if inputs
is a single Symbol.</li>
<li><strong>merge_outputs</strong> (<em>bool</em><em>, </em><em>optional</em>) – If <cite>False</cite>, returns outputs as a list of Symbols.
If <cite>True</cite>, concatenates output across time steps
and returns a single symbol with shape
(batch_size, length, ...) if layout is ‘NTC’,
or (length, batch_size, ...) if layout is ‘TNC’.
If <cite>None</cite>, output whatever is faster.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last"><ul class="simple">
<li><strong>outputs</strong> (<em>list of Symbol or Symbol</em>) – Symbol (if <cite>merge_outputs</cite> is True) or list of Symbols
(if <cite>merge_outputs</cite> is False) corresponding to the output from
the RNN from this unrolling.</li>
<li><strong>states</strong> (<em>list of Symbol</em>) – The new state of this RNN after this unrolling.
The type of this symbol is same as the output of <cite>begin_state()</cite>.</li>
</ul>
</p>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.rnn.RecurrentCell.forward">
<code class="descname">forward</code><span class="sig-paren">(</span><em>inputs</em>, <em>states</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.rnn.RecurrentCell.forward" title="Permalink to this definition"></a></dt>
<dd><p>Unrolls the recurrent cell for one time step.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>inputs</strong> (<em>sym.Variable</em>) – Input symbol, 2D, of shape (batch_size * num_units).</li>
<li><strong>states</strong> (<em>list of sym.Variable</em>) – RNN state from previous step or the output of begin_state().</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last"><ul class="simple">
<li><strong>output</strong> (<em>Symbol</em>) – Symbol corresponding to the output from the RNN when unrolling
for a single time step.</li>
<li><strong>states</strong> (<em>list of Symbol</em>) – The new state of this RNN after this unrolling.
The type of this symbol is same as the output of <cite>begin_state()</cite>.
This can be used as an input state to the next time step
of this RNN.</li>
</ul>
</p>
</td>
</tr>
</tbody>
</table>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<dl class="last docutils">
<dt><a class="reference internal" href="#mxnet.gluon.rnn.RecurrentCell.begin_state" title="mxnet.gluon.rnn.RecurrentCell.begin_state"><code class="xref py py-meth docutils literal"><span class="pre">begin_state()</span></code></a></dt>
<dd>This function can provide the states for the first time step.</dd>
<dt><a class="reference internal" href="#mxnet.gluon.rnn.RecurrentCell.unroll" title="mxnet.gluon.rnn.RecurrentCell.unroll"><code class="xref py py-meth docutils literal"><span class="pre">unroll()</span></code></a></dt>
<dd>This function unrolls an RNN for a given number of (>=1) time steps.</dd>
</dl>
</div>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.rnn.RNN">
<em class="property">class </em><code class="descclassname">mxnet.gluon.rnn.</code><code class="descname">RNN</code><span class="sig-paren">(</span><em>hidden_size</em>, <em>num_layers=1</em>, <em>activation='relu'</em>, <em>layout='TNC'</em>, <em>dropout=0</em>, <em>bidirectional=False</em>, <em>i2h_weight_initializer=None</em>, <em>h2h_weight_initializer=None</em>, <em>i2h_bias_initializer='zeros'</em>, <em>h2h_bias_initializer='zeros'</em>, <em>input_size=0</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.rnn.RNN" title="Permalink to this definition"></a></dt>
<dd><p>Applies a multi-layer Elman RNN with <cite>tanh</cite> or <cite>ReLU</cite> non-linearity to an input sequence.</p>
<p>For each element in the input sequence, each layer computes the following
function:</p>
<div class="math">
\[h_t = \tanh(w_{ih} * x_t + b_{ih} + w_{hh} * h_{(t-1)} + b_{hh})\]</div>
<p>where <span class="math">\(h_t\)</span> is the hidden state at time <cite>t</cite>, and <span class="math">\(x_t\)</span> is the hidden
state of the previous layer at time <cite>t</cite> or <span class="math">\(input_t\)</span> for the first layer.
If <cite>activation</cite> is ’relu’, then <cite>ReLU</cite> is used instead of <cite>tanh</cite>.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>hidden_size</strong> (<em>int</em>) – The number of features in the hidden state h.</li>
<li><strong>num_layers</strong> (<em>int</em><em>, </em><em>default 1</em>) – Number of recurrent layers.</li>
<li><strong>activation</strong> (<em>{'relu'</em><em> or </em><em>'tanh'}</em><em>, </em><em>default 'relu'</em>) – The activation function to use.</li>
<li><strong>layout</strong> (<em>str</em><em>, </em><em>default 'TNC'</em>) – The format of input and output tensors. T, N and C stand for
sequence length, batch size, and feature dimensions respectively.</li>
<li><strong>dropout</strong> (<em>float</em><em>, </em><em>default 0</em>) – If non-zero, introduces a dropout layer on the outputs of each
RNN layer except the last layer.</li>
<li><strong>bidirectional</strong> (<em>bool</em><em>, </em><em>default False</em>) – If <cite>True</cite>, becomes a bidirectional RNN.</li>
<li><strong>i2h_weight_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the input weights matrix, used for the linear
transformation of the inputs.</li>
<li><strong>h2h_weight_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the recurrent weights matrix, used for the linear
transformation of the recurrent state.</li>
<li><strong>i2h_bias_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the bias vector.</li>
<li><strong>h2h_bias_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the bias vector.</li>
<li><strong>input_size</strong> (<em>int</em><em>, </em><em>default 0</em>) – The number of expected features in the input x.
If not specified, it will be inferred from input.</li>
<li><strong>prefix</strong> (<em>str</em><em> or </em><em>None</em>) – Prefix of this <cite>Block</cite>.</li>
<li><strong>params</strong> (<a class="reference internal" href="#mxnet.gluon.ParameterDict" title="mxnet.gluon.ParameterDict"><em>ParameterDict</em></a><em> or </em><em>None</em>) – Shared Parameters for this <cite>Block</cite>.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shapes:</dt>
<dd>The input shape depends on <cite>layout</cite>. For <cite>layout=’TNC’</cite>, the
input has shape <cite>(sequence_length, batch_size, input_size)</cite></dd>
<dt>Output shape:</dt>
<dd>The output shape depends on <cite>layout</cite>. For <cite>layout=’TNC’</cite>, the
output has shape <cite>(sequence_length, batch_size, num_hidden)</cite>.
If <cite>bidirectional</cite> is True, output shape will instead be
<cite>(sequence_length, batch_size, 2*num_hidden)</cite></dd>
<dt>Recurrent state shape:</dt>
<dd>The recurrent state’s shape is <cite>(num_layers, batch_size, num_hidden)</cite>.
If <cite>bidirectional</cite> is True, state shape will instead be
<cite>(num_layers, batch_size, 2*num_hidden)</cite></dd>
</dl>
<p class="rubric">Examples</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">layer</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">gluon</span><span class="o">.</span><span class="n">rnn</span><span class="o">.</span><span class="n">RNN</span><span class="p">(</span><span class="mi">100</span><span class="p">,</span> <span class="mi">3</span><span class="p">)</span>
<span class="gp">>>> </span><span class="n">layer</span><span class="o">.</span><span class="n">initialize</span><span class="p">()</span>
<span class="gp">>>> </span><span class="nb">input</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">random_uniform</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">10</span><span class="p">))</span>
<span class="gp">>>> </span><span class="n">h0</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">random_uniform</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">100</span><span class="p">))</span>
<span class="gp">>>> </span><span class="n">output</span><span class="p">,</span> <span class="n">hn</span> <span class="o">=</span> <span class="n">layer</span><span class="p">(</span><span class="nb">input</span><span class="p">,</span> <span class="n">h0</span><span class="p">)</span>
</pre></div>
</div>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.rnn.LSTM">
<em class="property">class </em><code class="descclassname">mxnet.gluon.rnn.</code><code class="descname">LSTM</code><span class="sig-paren">(</span><em>hidden_size</em>, <em>num_layers=1</em>, <em>layout='TNC'</em>, <em>dropout=0</em>, <em>bidirectional=False</em>, <em>input_size=0</em>, <em>i2h_weight_initializer=None</em>, <em>h2h_weight_initializer=None</em>, <em>i2h_bias_initializer='zeros'</em>, <em>h2h_bias_initializer='zeros'</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.rnn.LSTM" title="Permalink to this definition"></a></dt>
<dd><p>Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence.</p>
<p>For each element in the input sequence, each layer computes the following
function:</p>
<div class="math">
\[\begin{split}\begin{array}{ll}
i_t = sigmoid(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
f_t = sigmoid(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hc} h_{(t-1)} + b_{hg}) \\
o_t = sigmoid(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
c_t = f_t * c_{(t-1)} + i_t * g_t \\
h_t = o_t * \tanh(c_t)
\end{array}\end{split}\]</div>
<p>where <span class="math">\(h_t\)</span> is the hidden state at time <cite>t</cite>, <span class="math">\(c_t\)</span> is the
cell state at time <cite>t</cite>, <span class="math">\(x_t\)</span> is the hidden state of the previous
layer at time <cite>t</cite> or <span class="math">\(input_t\)</span> for the first layer, and <span class="math">\(i_t\)</span>,
<span class="math">\(f_t\)</span>, <span class="math">\(g_t\)</span>, <span class="math">\(o_t\)</span> are the input, forget, cell, and
out gates, respectively.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>hidden_size</strong> (<em>int</em>) – The number of features in the hidden state h.</li>
<li><strong>num_layers</strong> (<em>int</em><em>, </em><em>default 1</em>) – Number of recurrent layers.</li>
<li><strong>layout</strong> (<em>str</em><em>, </em><em>default 'TNC'</em>) – The format of input and output tensors. T, N and C stand for
sequence length, batch size, and feature dimensions respectively.</li>
<li><strong>dropout</strong> (<em>float</em><em>, </em><em>default 0</em>) – If non-zero, introduces a dropout layer on the outputs of each
RNN layer except the last layer.</li>
<li><strong>bidirectional</strong> (<em>bool</em><em>, </em><em>default False</em>) – If <cite>True</cite>, becomes a bidirectional RNN.</li>
<li><strong>i2h_weight_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the input weights matrix, used for the linear
transformation of the inputs.</li>
<li><strong>h2h_weight_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the recurrent weights matrix, used for the linear
transformation of the recurrent state.</li>
<li><strong>i2h_bias_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a><em>, </em><em>default 'lstmbias'</em>) – Initializer for the bias vector. By default, bias for the forget
gate is initialized to 1 while all other biases are initialized
to zero.</li>
<li><strong>h2h_bias_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the bias vector.</li>
<li><strong>input_size</strong> (<em>int</em><em>, </em><em>default 0</em>) – The number of expected features in the input x.
If not specified, it will be inferred from input.</li>
<li><strong>prefix</strong> (<em>str</em><em> or </em><em>None</em>) – Prefix of this <cite>Block</cite>.</li>
<li><strong>params</strong> (<cite>ParameterDict</cite> or <cite>None</cite>) – Shared Parameters for this <cite>Block</cite>.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shapes:</dt>
<dd>The input shape depends on <cite>layout</cite>. For <cite>layout=’TNC’</cite>, the
input has shape <cite>(sequence_length, batch_size, input_size)</cite></dd>
<dt>Output shape:</dt>
<dd>The output shape depends on <cite>layout</cite>. For <cite>layout=’TNC’</cite>, the
output has shape <cite>(sequence_length, batch_size, num_hidden)</cite>.
If <cite>bidirectional</cite> is True, output shape will instead be
<cite>(sequence_length, batch_size, 2*num_hidden)</cite></dd>
<dt>Recurrent state shape:</dt>
<dd>The recurrent state is a list of two NDArrays. Both have shape
<cite>(num_layers, batch_size, num_hidden)</cite>.
If <cite>bidirectional</cite> is True, state shape will instead be
<cite>(num_layers, batch_size, 2*num_hidden)</cite>.</dd>
</dl>
<p class="rubric">Examples</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">layer</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">gluon</span><span class="o">.</span><span class="n">rnn</span><span class="o">.</span><span class="n">LSTM</span><span class="p">(</span><span class="mi">100</span><span class="p">,</span> <span class="mi">3</span><span class="p">)</span>
<span class="gp">>>> </span><span class="n">layer</span><span class="o">.</span><span class="n">initialize</span><span class="p">()</span>
<span class="gp">>>> </span><span class="nb">input</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">random_uniform</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">10</span><span class="p">))</span>
<span class="gp">>>> </span><span class="n">h0</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">random_uniform</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">100</span><span class="p">))</span>
<span class="gp">>>> </span><span class="n">c0</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">random_uniform</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">100</span><span class="p">))</span>
<span class="gp">>>> </span><span class="n">output</span><span class="p">,</span> <span class="n">hn</span> <span class="o">=</span> <span class="n">layer</span><span class="p">(</span><span class="nb">input</span><span class="p">,</span> <span class="p">[</span><span class="n">h0</span><span class="p">,</span> <span class="n">c0</span><span class="p">])</span>
</pre></div>
</div>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.rnn.GRU">
<em class="property">class </em><code class="descclassname">mxnet.gluon.rnn.</code><code class="descname">GRU</code><span class="sig-paren">(</span><em>hidden_size</em>, <em>num_layers=1</em>, <em>layout='TNC'</em>, <em>dropout=0</em>, <em>bidirectional=False</em>, <em>input_size=0</em>, <em>i2h_weight_initializer=None</em>, <em>h2h_weight_initializer=None</em>, <em>i2h_bias_initializer='zeros'</em>, <em>h2h_bias_initializer='zeros'</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.rnn.GRU" title="Permalink to this definition"></a></dt>
<dd><p>Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence.</p>
<p>For each element in the input sequence, each layer computes the following
function:</p>
<div class="math">
\[\begin{split}\begin{array}{ll}
r_t = sigmoid(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
i_t = sigmoid(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\
h_t = (1 - i_t) * n_t + i_t * h_{(t-1)} \\
\end{array}\end{split}\]</div>
<p>where <span class="math">\(h_t\)</span> is the hidden state at time <cite>t</cite>, <span class="math">\(x_t\)</span> is the hidden
state of the previous layer at time <cite>t</cite> or <span class="math">\(input_t\)</span> for the first layer,
and <span class="math">\(r_t\)</span>, <span class="math">\(i_t\)</span>, <span class="math">\(n_t\)</span> are the reset, input, and new gates, respectively.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>hidden_size</strong> (<em>int</em>) – The number of features in the hidden state h</li>
<li><strong>num_layers</strong> (<em>int</em><em>, </em><em>default 1</em>) – Number of recurrent layers.</li>
<li><strong>layout</strong> (<em>str</em><em>, </em><em>default 'TNC'</em>) – The format of input and output tensors. T, N and C stand for
sequence length, batch size, and feature dimensions respectively.</li>
<li><strong>dropout</strong> (<em>float</em><em>, </em><em>default 0</em>) – If non-zero, introduces a dropout layer on the outputs of each
RNN layer except the last layer.</li>
<li><strong>bidirectional</strong> (<em>bool</em><em>, </em><em>default False</em>) – If True, becomes a bidirectional RNN.</li>
<li><strong>i2h_weight_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the input weights matrix, used for the linear
transformation of the inputs.</li>
<li><strong>h2h_weight_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the recurrent weights matrix, used for the linear
transformation of the recurrent state.</li>
<li><strong>i2h_bias_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the bias vector.</li>
<li><strong>h2h_bias_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the bias vector.</li>
<li><strong>input_size</strong> (<em>int</em><em>, </em><em>default 0</em>) – The number of expected features in the input x.
If not specified, it will be inferred from input.</li>
<li><strong>prefix</strong> (<em>str</em><em> or </em><em>None</em>) – Prefix of this <cite>Block</cite>.</li>
<li><strong>params</strong> (<a class="reference internal" href="#mxnet.gluon.ParameterDict" title="mxnet.gluon.ParameterDict"><em>ParameterDict</em></a><em> or </em><em>None</em>) – Shared Parameters for this <cite>Block</cite>.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>Input shapes:</dt>
<dd>The input shape depends on <cite>layout</cite>. For <cite>layout=’TNC’</cite>, the
input has shape <cite>(sequence_length, batch_size, input_size)</cite></dd>
<dt>Output shape:</dt>
<dd>The output shape depends on <cite>layout</cite>. For <cite>layout=’TNC’</cite>, the
output has shape <cite>(sequence_length, batch_size, num_hidden)</cite>.
If <cite>bidirectional</cite> is True, output shape will instead be
<cite>(sequence_length, batch_size, 2*num_hidden)</cite></dd>
<dt>Recurrent state shape:</dt>
<dd>The recurrent state’s shape is <cite>(num_layers, batch_size, num_hidden)</cite>.
If <cite>bidirectional</cite> is True, state shape will instead be
<cite>(num_layers, batch_size, 2*num_hidden)</cite></dd>
</dl>
<p class="rubric">Examples</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">layer</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">gluon</span><span class="o">.</span><span class="n">rnn</span><span class="o">.</span><span class="n">GRU</span><span class="p">(</span><span class="mi">100</span><span class="p">,</span> <span class="mi">3</span><span class="p">)</span>
<span class="gp">>>> </span><span class="n">layer</span><span class="o">.</span><span class="n">initialize</span><span class="p">()</span>
<span class="gp">>>> </span><span class="nb">input</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">random_uniform</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">10</span><span class="p">))</span>
<span class="gp">>>> </span><span class="n">h0</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">random_uniform</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">100</span><span class="p">))</span>
<span class="gp">>>> </span><span class="n">output</span><span class="p">,</span> <span class="n">hn</span> <span class="o">=</span> <span class="n">layer</span><span class="p">(</span><span class="nb">input</span><span class="p">,</span> <span class="n">h0</span><span class="p">)</span>
</pre></div>
</div>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.rnn.RNNCell">
<em class="property">class </em><code class="descclassname">mxnet.gluon.rnn.</code><code class="descname">RNNCell</code><span class="sig-paren">(</span><em>hidden_size</em>, <em>activation='tanh'</em>, <em>i2h_weight_initializer=None</em>, <em>h2h_weight_initializer=None</em>, <em>i2h_bias_initializer='zeros'</em>, <em>h2h_bias_initializer='zeros'</em>, <em>input_size=0</em>, <em>prefix=None</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.rnn.RNNCell" title="Permalink to this definition"></a></dt>
<dd><p>Simple recurrent neural network cell.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>hidden_size</strong> (<em>int</em>) – Number of units in output symbol</li>
<li><strong>activation</strong> (<em>str</em><em> or </em><a class="reference internal" href="symbol.html#mxnet.symbol.Symbol" title="mxnet.symbol.Symbol"><em>Symbol</em></a><em>, </em><em>default 'tanh'</em>) – Type of activation function.</li>
<li><strong>i2h_weight_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the input weights matrix, used for the linear
transformation of the inputs.</li>
<li><strong>h2h_weight_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the recurrent weights matrix, used for the linear
transformation of the recurrent state.</li>
<li><strong>i2h_bias_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the bias vector.</li>
<li><strong>h2h_bias_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the bias vector.</li>
<li><strong>prefix</strong> (<em>str</em><em>, </em><em>default 'rnn_'</em>) – Prefix for name of <cite>Block</cite>s
(and name of weight if params is <cite>None</cite>).</li>
<li><strong>params</strong> (<a class="reference internal" href="#mxnet.gluon.Parameter" title="mxnet.gluon.Parameter"><em>Parameter</em></a><em> or </em><em>None</em>) – Container for weight sharing between cells.
Created if <cite>None</cite>.</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.rnn.LSTMCell">
<em class="property">class </em><code class="descclassname">mxnet.gluon.rnn.</code><code class="descname">LSTMCell</code><span class="sig-paren">(</span><em>hidden_size</em>, <em>i2h_weight_initializer=None</em>, <em>h2h_weight_initializer=None</em>, <em>i2h_bias_initializer='zeros'</em>, <em>h2h_bias_initializer='zeros'</em>, <em>input_size=0</em>, <em>prefix=None</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.rnn.LSTMCell" title="Permalink to this definition"></a></dt>
<dd><p>Long Short-Term Memory (LSTM) network cell.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>hidden_size</strong> (<em>int</em>) – Number of units in output symbol.</li>
<li><strong>i2h_weight_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the input weights matrix, used for the linear
transformation of the inputs.</li>
<li><strong>h2h_weight_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the recurrent weights matrix, used for the linear
transformation of the recurrent state.</li>
<li><strong>i2h_bias_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a><em>, </em><em>default 'lstmbias'</em>) – Initializer for the bias vector. By default, bias for the forget
gate is initialized to 1 while all other biases are initialized
to zero.</li>
<li><strong>h2h_bias_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the bias vector.</li>
<li><strong>prefix</strong> (<em>str</em><em>, </em><em>default 'lstm_'</em>) – Prefix for name of <cite>Block</cite>s
(and name of weight if params is <cite>None</cite>).</li>
<li><strong>params</strong> (<a class="reference internal" href="#mxnet.gluon.Parameter" title="mxnet.gluon.Parameter"><em>Parameter</em></a><em> or </em><em>None</em>) – Container for weight sharing between cells.
Created if <cite>None</cite>.</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.rnn.GRUCell">
<em class="property">class </em><code class="descclassname">mxnet.gluon.rnn.</code><code class="descname">GRUCell</code><span class="sig-paren">(</span><em>hidden_size</em>, <em>i2h_weight_initializer=None</em>, <em>h2h_weight_initializer=None</em>, <em>i2h_bias_initializer='zeros'</em>, <em>h2h_bias_initializer='zeros'</em>, <em>input_size=0</em>, <em>prefix=None</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.rnn.GRUCell" title="Permalink to this definition"></a></dt>
<dd><p>Gated Recurrent Unit (GRU) network cell.
Note: this is an implementation of the cuDNN version of GRUs
(slight modification compared to Cho et al. 2014).</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>hidden_size</strong> (<em>int</em>) – Number of units in output symbol.</li>
<li><strong>i2h_weight_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the input weights matrix, used for the linear
transformation of the inputs.</li>
<li><strong>h2h_weight_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the recurrent weights matrix, used for the linear
transformation of the recurrent state.</li>
<li><strong>i2h_bias_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the bias vector.</li>
<li><strong>h2h_bias_initializer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.initializer.Initializer" title="mxnet.initializer.Initializer"><em>Initializer</em></a>) – Initializer for the bias vector.</li>
<li><strong>prefix</strong> (<em>str</em><em>, </em><em>default 'gru_'</em>) – Prefix for name of <cite>Block</cite>s
(and name of weight if params is <cite>None</cite>).</li>
<li><strong>params</strong> (<a class="reference internal" href="#mxnet.gluon.Parameter" title="mxnet.gluon.Parameter"><em>Parameter</em></a><em> or </em><em>None</em>) – Container for weight sharing between cells.
Created if <cite>None</cite>.</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.rnn.SequentialRNNCell">
<em class="property">class </em><code class="descclassname">mxnet.gluon.rnn.</code><code class="descname">SequentialRNNCell</code><span class="sig-paren">(</span><em>prefix=None</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.rnn.SequentialRNNCell" title="Permalink to this definition"></a></dt>
<dd><p>Sequentially stacking multiple RNN cells.</p>
<dl class="method">
<dt id="mxnet.gluon.rnn.SequentialRNNCell.add">
<code class="descname">add</code><span class="sig-paren">(</span><em>cell</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.rnn.SequentialRNNCell.add" title="Permalink to this definition"></a></dt>
<dd><p>Appends a cell into the stack.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>cell</strong> (<em>rnn cell</em>) – </td>
</tr>
</tbody>
</table>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.rnn.BidirectionalCell">
<em class="property">class </em><code class="descclassname">mxnet.gluon.rnn.</code><code class="descname">BidirectionalCell</code><span class="sig-paren">(</span><em>l_cell</em>, <em>r_cell</em>, <em>output_prefix='bi_'</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.rnn.BidirectionalCell" title="Permalink to this definition"></a></dt>
<dd><p>Bidirectional RNN cell.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>l_cell</strong> (<a class="reference internal" href="#mxnet.gluon.rnn.RecurrentCell" title="mxnet.gluon.rnn.RecurrentCell"><em>RecurrentCell</em></a>) – Cell for forward unrolling</li>
<li><strong>r_cell</strong> (<a class="reference internal" href="#mxnet.gluon.rnn.RecurrentCell" title="mxnet.gluon.rnn.RecurrentCell"><em>RecurrentCell</em></a>) – Cell for backward unrolling</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.rnn.DropoutCell">
<em class="property">class </em><code class="descclassname">mxnet.gluon.rnn.</code><code class="descname">DropoutCell</code><span class="sig-paren">(</span><em>dropout</em>, <em>prefix=None</em>, <em>params=None</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.rnn.DropoutCell" title="Permalink to this definition"></a></dt>
<dd><p>Applies dropout on input.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>dropout</strong> (<em>float</em>) – Percentage of elements to drop out, which
is 1 - percentage to retain.</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.rnn.ZoneoutCell">
<em class="property">class </em><code class="descclassname">mxnet.gluon.rnn.</code><code class="descname">ZoneoutCell</code><span class="sig-paren">(</span><em>base_cell</em>, <em>zoneout_outputs=0.0</em>, <em>zoneout_states=0.0</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.rnn.ZoneoutCell" title="Permalink to this definition"></a></dt>
<dd><p>Applies Zoneout on base cell.</p>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.rnn.ResidualCell">
<em class="property">class </em><code class="descclassname">mxnet.gluon.rnn.</code><code class="descname">ResidualCell</code><span class="sig-paren">(</span><em>base_cell</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.rnn.ResidualCell" title="Permalink to this definition"></a></dt>
<dd><p>Adds residual connection as described in Wu et al, 2016
(<a class="reference external" href="https://arxiv.org/abs/1609.08144">https://arxiv.org/abs/1609.08144</a>).
Output of the cell is output of the base cell plus input.</p>
</dd></dl>
</div>
<div class="section" id="trainer">
<span id="trainer"></span><h2>Trainer<a class="headerlink" href="#trainer" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="mxnet.gluon.Trainer">
<em class="property">class </em><code class="descclassname">mxnet.gluon.</code><code class="descname">Trainer</code><span class="sig-paren">(</span><em>params</em>, <em>optimizer</em>, <em>optimizer_params</em>, <em>kvstore='device'</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Trainer" title="Permalink to this definition"></a></dt>
<dd><p>Applies an <cite>Optimizer</cite> on a set of Parameters. Trainer should
be used together with <cite>autograd</cite>.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>params</strong> (<a class="reference internal" href="#mxnet.gluon.ParameterDict" title="mxnet.gluon.ParameterDict"><em>ParameterDict</em></a>) – The set of parameters to optimize.</li>
<li><strong>optimizer</strong> (<em>str</em><em> or </em><a class="reference internal" href="optimization.html#mxnet.optimizer.Optimizer" title="mxnet.optimizer.Optimizer"><em>Optimizer</em></a>) – The optimizer to use.</li>
<li><strong>optimizer_params</strong> (<em>dict</em>) – Key-word arguments to be passed to optimizer constructor. For example,
<cite>{‘learning_rate’: 0.1}</cite></li>
<li><strong>kvstore</strong> (<em>str</em><em> or </em><a class="reference internal" href="kvstore.html#mxnet.kvstore.KVStore" title="mxnet.kvstore.KVStore"><em>KVStore</em></a>) – kvstore type for multi-gpu and distributed training.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="method">
<dt id="mxnet.gluon.Trainer.step">
<code class="descname">step</code><span class="sig-paren">(</span><em>batch_size</em>, <em>ignore_stale_grad=False</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.Trainer.step" title="Permalink to this definition"></a></dt>
<dd><p>Makes one step of parameter update. Should be called after
<cite>autograd.compute_gradient</cite> and outside of <cite>record()</cite> scope.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>batch_size</strong> (<em>int</em>) – Batch size of data processed. Gradient will be normalized by <cite>1/batch_size</cite>.
Set this to 1 if you normalized loss manually with <cite>loss = mean(loss)</cite>.</li>
<li><strong>ignore_stale_grad</strong> (<em>bool</em><em>, </em><em>optional</em><em>, </em><em>default=False</em>) – If true, ignores Parameters with stale gradient (gradient that has not
been updated by <cite>backward</cite> after last step) and skip update.</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>
</dd></dl>
</div>
<div class="section" id="loss-functions">
<span id="loss-functions"></span><h2>Loss functions<a class="headerlink" href="#loss-functions" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="mxnet.gluon.loss.L2Loss">
<em class="property">class </em><code class="descclassname">mxnet.gluon.loss.</code><code class="descname">L2Loss</code><span class="sig-paren">(</span><em>weight=1.0</em>, <em>batch_axis=0</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.loss.L2Loss" title="Permalink to this definition"></a></dt>
<dd><p>Calculates the mean squared error between output and label:</p>
<div class="math">
\[L = \frac{1}{2}\sum_i \Vert {output}_i - {label}_i \Vert^2.\]</div>
<p>Output and label can have arbitrary shape as long as they have the same
number of elements.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>weight</strong> (<em>float</em><em> or </em><em>None</em>) – Global scalar weight for loss.</li>
<li><strong>sample_weight</strong> (<a class="reference internal" href="symbol.html#mxnet.symbol.Symbol" title="mxnet.symbol.Symbol"><em>Symbol</em></a><em> or </em><em>None</em>) – Per sample weighting. Must be broadcastable to
the same shape as loss. For example, if loss has
shape (64, 10) and you want to weight each sample
in the batch, <cite>sample_weight</cite> should have shape (64, 1).</li>
<li><strong>batch_axis</strong> (<em>int</em><em>, </em><em>default 0</em>) – The axis that represents mini-batch.</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.loss.L1Loss">
<em class="property">class </em><code class="descclassname">mxnet.gluon.loss.</code><code class="descname">L1Loss</code><span class="sig-paren">(</span><em>weight=None</em>, <em>batch_axis=0</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.loss.L1Loss" title="Permalink to this definition"></a></dt>
<dd><p>Calculates the mean absolute error between output and label:</p>
<div class="math">
\[L = \frac{1}{2}\sum_i \vert {output}_i - {label}_i \vert.\]</div>
<p>Output and label must have the same shape.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>weight</strong> (<em>float</em><em> or </em><em>None</em>) – Global scalar weight for loss.</li>
<li><strong>sample_weight</strong> (<a class="reference internal" href="symbol.html#mxnet.symbol.Symbol" title="mxnet.symbol.Symbol"><em>Symbol</em></a><em> or </em><em>None</em>) – Per sample weighting. Must be broadcastable to
the same shape as loss. For example, if loss has
shape (64, 10) and you want to weight each sample
in the batch, <cite>sample_weight</cite> should have shape (64, 1).</li>
<li><strong>batch_axis</strong> (<em>int</em><em>, </em><em>default 0</em>) – The axis that represents mini-batch.</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.loss.SoftmaxCrossEntropyLoss">
<em class="property">class </em><code class="descclassname">mxnet.gluon.loss.</code><code class="descname">SoftmaxCrossEntropyLoss</code><span class="sig-paren">(</span><em>axis=-1</em>, <em>sparse_label=True</em>, <em>from_logits=False</em>, <em>weight=None</em>, <em>batch_axis=0</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.loss.SoftmaxCrossEntropyLoss" title="Permalink to this definition"></a></dt>
<dd><p>Computes the softmax cross entropy loss.</p>
<p>If <cite>sparse_label</cite> is <cite>True</cite>, label should contain integer category indicators:</p>
<div class="math">
\[ \begin{align}\begin{aligned}p = {softmax}({output})\\L = -\sum_i {log}(p_{i,{label}_i})\end{aligned}\end{align} \]</div>
<p>Label’s shape should be output’s shape without the <cite>axis</cite> dimension. i.e. for
<cite>output.shape</cite> = (1,2,3,4) and axis = 2, <cite>label.shape</cite> should be (1,2,4).</p>
<p>If <cite>sparse_label</cite> is <cite>False</cite>, label should contain probability distribution
with the same shape as output:</p>
<div class="math">
\[ \begin{align}\begin{aligned}p = {softmax}({output})\\L = -\sum_i \sum_j {label}_j {log}(p_{ij})\end{aligned}\end{align} \]</div>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>axis</strong> (<em>int</em><em>, </em><em>default -1</em>) – The axis to sum over when computing softmax and entropy.</li>
<li><strong>sparse_label</strong> (<em>bool</em><em>, </em><em>default True</em>) – Whether label is an integer array instead of probability distribution.</li>
<li><strong>from_logits</strong> (<em>bool</em><em>, </em><em>default False</em>) – Whether input is a log probability (usually from log_softmax) instead
of unnormalized numbers.</li>
<li><strong>weight</strong> (<em>float</em><em> or </em><em>None</em>) – Global scalar weight for loss.</li>
<li><strong>sample_weight</strong> (<a class="reference internal" href="symbol.html#mxnet.symbol.Symbol" title="mxnet.symbol.Symbol"><em>Symbol</em></a><em> or </em><em>None</em>) – Per sample weighting. Must be broadcastable to
the same shape as loss. For example, if loss has
shape (64, 10) and you want to weight each sample
in the batch, <cite>sample_weight</cite> should have shape (64, 1).</li>
<li><strong>batch_axis</strong> (<em>int</em><em>, </em><em>default 0</em>) – The axis that represents mini-batch.</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="class">
<dt id="mxnet.gluon.loss.KLDivLoss">
<em class="property">class </em><code class="descclassname">mxnet.gluon.loss.</code><code class="descname">KLDivLoss</code><span class="sig-paren">(</span><em>from_logits=True</em>, <em>weight=None</em>, <em>batch_axis=0</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.loss.KLDivLoss" title="Permalink to this definition"></a></dt>
<dd><p>The Kullback-Leibler divergence loss.</p>
<p>KL divergence is a useful distance measure for continuous distributions
and is often useful when performing direct regression over the space of
(discretely sampled) continuous output distributions.</p>
<div class="math">
\[L = 1/n \sum_i (label_i * (log(label_i) - output_i))\]</div>
<p>Label’s shape should be the same as output’s.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>from_logits</strong> (bool, default is <cite>True</cite>) – Whether the input is log probability (usually from log_softmax) instead
of unnormalized numbers.</li>
<li><strong>weight</strong> (<em>float</em><em> or </em><em>None</em>) – Global scalar weight for loss.</li>
<li><strong>sample_weight</strong> (<a class="reference internal" href="symbol.html#mxnet.symbol.Symbol" title="mxnet.symbol.Symbol"><em>Symbol</em></a><em> or </em><em>None</em>) – Per sample weighting. Must be broadcastable to
the same shape as loss. For example, if loss has
shape (64, 10) and you want to weight each sample
in the batch, <cite>sample_weight</cite> should have shape (64, 1).</li>
<li><strong>batch_axis</strong> (<em>int</em><em>, </em><em>default 0</em>) – The axis that represents mini-batch.</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>
</div>
<div class="section" id="utilities">
<span id="utilities"></span><h2>Utilities<a class="headerlink" href="#utilities" title="Permalink to this headline"></a></h2>
<dl class="method">
<dt id="mxnet.gluon.utils.split_data">
<code class="descclassname">utils.</code><code class="descname">split_data</code><span class="sig-paren">(</span><em>data</em>, <em>num_slice</em>, <em>batch_axis=0</em>, <em>even_split=True</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.utils.split_data" title="Permalink to this definition"></a></dt>
<dd><p>Splits an NDArray into <cite>num_slice</cite> slices along <cite>batch_axis</cite>.
Usually used for data parallelism where each slice is sent
to one device (i.e. GPU).</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>data</strong> (<a class="reference internal" href="ndarray.html#mxnet.ndarray.NDArray" title="mxnet.ndarray.NDArray"><em>NDArray</em></a>) – A batch of data.</li>
<li><strong>num_slice</strong> (<em>int</em>) – Number of desired slices.</li>
<li><strong>batch_axis</strong> (<em>int</em><em>, </em><em>default 0</em>) – The axis along which to slice.</li>
<li><strong>even_split</strong> (<em>bool</em><em>, </em><em>default True</em>) – Whether to force all slices to have the same number of elements.
If <cite>True</cite>, an error will be raised when <cite>num_slice</cite> does not evenly
divide <cite>data.shape[batch_axis]</cite>.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">Return value is a list even if <cite>num_slice</cite> is 1.</p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last">list of NDArray</p>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.utils.split_and_load">
<code class="descclassname">utils.</code><code class="descname">split_and_load</code><span class="sig-paren">(</span><em>data</em>, <em>ctx_list</em>, <em>batch_axis=0</em>, <em>even_split=True</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.utils.split_and_load" title="Permalink to this definition"></a></dt>
<dd><p>Splits an NDArray into <cite>len(ctx_list)</cite> slices along <cite>batch_axis</cite> and loads
each slice to one context in <cite>ctx_list</cite>.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>data</strong> (<a class="reference internal" href="ndarray.html#mxnet.ndarray.NDArray" title="mxnet.ndarray.NDArray"><em>NDArray</em></a>) – A batch of data.</li>
<li><strong>ctx_list</strong> (<em>list of Context</em>) – A list of Contexts.</li>
<li><strong>batch_axis</strong> (<em>int</em><em>, </em><em>default 0</em>) – The axis along which to slice.</li>
<li><strong>even_split</strong> (<em>bool</em><em>, </em><em>default True</em>) – Whether to force all slices to have the same number of elements.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">Each slice corresponds to a context in <cite>ctx_list</cite>.</p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last">list of NDArray</p>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.gluon.utils.clip_global_norm">
<code class="descclassname">utils.</code><code class="descname">clip_global_norm</code><span class="sig-paren">(</span><em>arrays</em>, <em>max_norm</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.gluon.utils.clip_global_norm" title="Permalink to this definition"></a></dt>
<dd><p>Rescales NDArrays so that the sum of their 2-norm is smaller than <cite>max_norm</cite>.</p>
</dd></dl>
<script>auto_index("api-reference");</script></div>
</div>
<div class="container">
<div class="footer">
<p> © 2015-2017 DMLC. All rights reserved. </p>
</div>
</div>
</div>
<div aria-label="main navigation" class="sphinxsidebar rightsidebar" role="navigation">
<div class="sphinxsidebarwrapper">
<h3><a href="../../index.html">Table Of Contents</a></h3>
<ul>
<li><a class="reference internal" href="#">Gluon Package</a><ul>
<li><a class="reference internal" href="#overview">Overview</a></li>
<li><a class="reference internal" href="#parameter">Parameter</a></li>
<li><a class="reference internal" href="#containers">Containers</a></li>
<li><a class="reference internal" href="#neural-network-layers">Neural Network Layers</a><ul>
<li><a class="reference internal" href="#containers">Containers</a></li>
<li><a class="reference internal" href="#basic-layers">Basic Layers</a></li>
<li><a class="reference internal" href="#convolutional-layers">Convolutional Layers</a></li>
<li><a class="reference internal" href="#pooling-layers">Pooling Layers</a></li>
</ul>
</li>
<li><a class="reference internal" href="#recurrent-layers">Recurrent Layers</a></li>
<li><a class="reference internal" href="#trainer">Trainer</a></li>
<li><a class="reference internal" href="#loss-functions">Loss functions</a></li>
<li><a class="reference internal" href="#utilities">Utilities</a></li>
</ul>
</li>
</ul>
</div>
</div>
</div> <!-- pagename != index -->
<script crossorigin="anonymous" integrity="sha384-0mSbJDEHialfmuBBQP6A4Qrprq5OVfW37PRR3j5ELqxss1yVqOtnepnHVP9aJ7xS" src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js"></script>
<script src="../../_static/js/sidebar.js" type="text/javascript"></script>
<script src="../../_static/js/search.js" type="text/javascript"></script>
<script src="../../_static/js/navbar.js" type="text/javascript"></script>
<script src="../../_static/js/clipboard.min.js" type="text/javascript"></script>
<script src="../../_static/js/copycode.js" type="text/javascript"></script>
<script type="text/javascript">
// Reveal the page once the DOM is ready; the body is kept hidden until then
// to avoid a flash of unstyled/unscripted content while assets load.
// Note: $(handler) is the supported jQuery document-ready form;
// calling .ready() on a non-document selection (the original
// $('body').ready(...)) is deprecated and fires on document-ready anyway.
$(function () {
$('body').css('visibility', 'visible');
});
</script>
</div></body>
</html>