| <!DOCTYPE html> |
| |
| <html lang="en"> |
| <head> |
| <meta charset="utf-8"/> |
| <meta content="IE=edge" http-equiv="X-UA-Compatible"/> |
| <meta content="width=device-width, initial-scale=1" name="viewport"/> |
| <title>Deep Learning Programming Style — mxnet documentation</title> |
| <link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css" integrity="sha384-1q8mTJOASx8j1Au+a5WDVnPi2lkFfwwEAa8hDDdjZlpLegxhjVME1fgjWPGmkzs7" rel="stylesheet"/> |
| <link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css" rel="stylesheet"/> |
| <link href="../_static/basic.css" rel="stylesheet" type="text/css"/> |
| <link href="../_static/pygments.css" rel="stylesheet" type="text/css"/> |
| <link href="../_static/mxnet.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript"> |
| var DOCUMENTATION_OPTIONS = { |
| URL_ROOT: '../', |
| VERSION: '', |
| COLLAPSE_INDEX: false, |
| FILE_SUFFIX: '.html', |
| HAS_SOURCE: true, |
| SOURCELINK_SUFFIX: '' |
| }; |
| </script> |
| <script src="../_static/jquery-1.11.1.js" type="text/javascript"></script> |
| <script src="../_static/underscore.js" type="text/javascript"></script> |
| <script src="../_static/searchtools_custom.js" type="text/javascript"></script> |
| <script src="../_static/doctools.js" type="text/javascript"></script> |
| <script src="../_static/selectlang.js" type="text/javascript"></script> |
| <script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript"></script> |
| <script type="text/javascript"> jQuery(function() { Search.loadIndex("/searchindex.js"); Search.init();}); </script> |
| <script> |
| (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ |
| (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new |
| Date();a=s.createElement(o), |
| m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) |
| })(window,document,'script','https://www.google-analytics.com/analytics.js','ga'); |
| |
| ga('create', 'UA-96378503-1', 'auto'); |
| ga('send', 'pageview'); |
| |
| </script> |
| <!-- --> |
| <!-- <script type="text/javascript" src="../_static/jquery.js"></script> --> |
| <!-- --> |
| <!-- <script type="text/javascript" src="../_static/underscore.js"></script> --> |
| <!-- --> |
| <!-- <script type="text/javascript" src="../_static/doctools.js"></script> --> |
| <!-- --> |
| <!-- <script type="text/javascript" src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script> --> |
| <!-- --> |
| <link href="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/mxnet-icon.png" rel="icon" type="image/png"/> |
| </head> |
| <body role="document"><div class="navbar navbar-fixed-top"> |
| <div class="container" id="navContainer"> |
| <div class="innder" id="header-inner"> |
| <h1 id="logo-wrap"> |
| <a href="../" id="logo"><img src="../_static/mxnet.png"/></a> |
| </h1> |
| <nav class="nav-bar" id="main-nav"> |
| <a class="main-nav-link" href="../get_started/install.html">Install</a> |
| <a class="main-nav-link" href="../tutorials/index.html">Tutorials</a> |
| <span id="dropdown-menu-position-anchor"> |
| <a aria-expanded="true" aria-haspopup="true" class="main-nav-link dropdown-toggle" data-toggle="dropdown" href="#" role="button">Gluon <span class="caret"></span></a> |
| <ul class="dropdown-menu" id="package-dropdown-menu"> |
| <li><a class="main-nav-link" href="../gluon/index.html">About</a></li> |
| <li><a class="main-nav-link" href="http://gluon.mxnet.io/">Tutorials</a></li> |
| </ul> |
| </span> |
| <a class="main-nav-link" href="../how_to/index.html">How To</a> |
| <span id="dropdown-menu-position-anchor"> |
| <a aria-expanded="true" aria-haspopup="true" class="main-nav-link dropdown-toggle" data-toggle="dropdown" href="#" role="button">API <span class="caret"></span></a> |
| <ul class="dropdown-menu" id="package-dropdown-menu"> |
| <li><a class="main-nav-link" href="../api/python/index.html">Python</a></li> |
| <li><a class="main-nav-link" href="../api/scala/index.html">Scala</a></li> |
| <li><a class="main-nav-link" href="../api/r/index.html">R</a></li> |
| <li><a class="main-nav-link" href="../api/julia/index.html">Julia</a></li> |
| <li><a class="main-nav-link" href="../api/c++/index.html">C++</a></li> |
| <li><a class="main-nav-link" href="../api/perl/index.html">Perl</a></li> |
| </ul> |
| </span> |
| <a class="main-nav-link" href="../architecture/index.html">Architecture</a> |
| <a class="main-nav-link" href="https://github.com/dmlc/mxnet">Github</a> |
| <span id="dropdown-menu-position-anchor-version" style="position: relative"><a href="#" class="main-nav-link dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">Versions(0.11.0)<span class="caret"></span></a><ul id="package-dropdown-menu" class="dropdown-menu"><li><a class="main-nav-link" href=https://mxnet.incubator.apache.org/>1.0.0</a></li><li><a class="main-nav-link" href=https://mxnet.incubator.apache.org/versions/0.12.1/index.html>0.12.1</a></li><li><a class="main-nav-link" href=https://mxnet.incubator.apache.org/versions/0.12.0/index.html>0.12.0</a></li><li><a class="main-nav-link" href=https://mxnet.incubator.apache.org/versions/0.11.0/index.html>0.11.0</a></li><li><a class="main-nav-link" href=https://mxnet.incubator.apache.org/versions/master/index.html>master</a></li></ul></span></nav> |
| <script> function getRootPath(){ return "../" } </script> |
| <div class="burgerIcon dropdown"> |
| <a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button">☰</a> |
| <ul class="dropdown-menu dropdown-menu-right" id="burgerMenu"> |
| <li><a href="../get_started/install.html">Install</a></li> |
| <li><a href="../tutorials/index.html">Tutorials</a></li> |
| <li><a href="../how_to/index.html">How To</a></li> |
| <li class="dropdown-submenu"> |
| <a href="#" tabindex="-1">API</a> |
| <ul class="dropdown-menu"> |
| <li><a href="../api/python/index.html" tabindex="-1">Python</a> |
| </li> |
| <li><a href="../api/scala/index.html" tabindex="-1">Scala</a> |
| </li> |
| <li><a href="../api/r/index.html" tabindex="-1">R</a> |
| </li> |
| <li><a href="../api/julia/index.html" tabindex="-1">Julia</a> |
| </li> |
| <li><a href="../api/c++/index.html" tabindex="-1">C++</a> |
| </li> |
| <li><a href="../api/perl/index.html" tabindex="-1">Perl</a> |
| </li> |
| </ul> |
| </li> |
| <li><a href="../architecture/index.html">Architecture</a></li> |
| <li><a class="main-nav-link" href="https://github.com/dmlc/mxnet">Github</a></li> |
| <li id="dropdown-menu-position-anchor-version-mobile" class="dropdown-submenu" style="position: relative"><a href="#" tabindex="-1">Versions(0.11.0)</a><ul class="dropdown-menu"><li><a tabindex="-1" href=https://mxnet.incubator.apache.org/>1.0.0</a></li><li><a tabindex="-1" href=https://mxnet.incubator.apache.org/versions/0.12.1/index.html>0.12.1</a></li><li><a tabindex="-1" href=https://mxnet.incubator.apache.org/versions/0.12.0/index.html>0.12.0</a></li><li><a tabindex="-1" href=https://mxnet.incubator.apache.org/versions/0.11.0/index.html>0.11.0</a></li><li><a tabindex="-1" href=https://mxnet.incubator.apache.org/versions/master/index.html>master</a></li></ul></li></ul> |
| </div> |
| <div class="plusIcon dropdown"> |
| <a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button"><span aria-hidden="true" class="glyphicon glyphicon-plus"></span></a> |
| <ul class="dropdown-menu dropdown-menu-right" id="plusMenu"></ul> |
| </div> |
| <div id="search-input-wrap"> |
| <form action="../search.html" autocomplete="off" class="" method="get" role="search"> |
| <div class="form-group inner-addon left-addon"> |
| <i class="glyphicon glyphicon-search"></i> |
| <input class="form-control" name="q" placeholder="Search" type="text"/> |
| </div> |
| <input name="check_keywords" type="hidden" value="yes"/> |
| <input name="area" type="hidden" value="default"/> |
| </form> |
| <div id="search-preview"></div> |
| </div> |
| <div id="searchIcon"> |
| <span aria-hidden="true" class="glyphicon glyphicon-search"></span> |
| </div> |
| </div> |
| </div> |
| </div> |
| <div class="container"> |
| <div class="row"> |
| <div aria-label="main navigation" class="sphinxsidebar leftsidebar" role="navigation"> |
| <div class="sphinxsidebarwrapper"> |
| <ul> |
| <li class="toctree-l1"><a class="reference internal" href="../api/python/index.html">Python Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/r/index.html">R Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/julia/index.html">Julia Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/c++/index.html">C++ Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/scala/index.html">Scala Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/perl/index.html">Perl Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../how_to/index.html">HowTo Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="index.html">System Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../tutorials/index.html">Tutorials</a></li> |
| </ul> |
| </div> |
| </div> |
| <div class="content"> |
| <div class="section" id="deep-learning-programming-style"> |
| <span id="deep-learning-programming-style"></span><h1>Deep Learning Programming Style<a class="headerlink" href="#deep-learning-programming-style" title="Permalink to this headline">¶</a></h1> |
| <p>However much we might ultimately care about performance, |
| we first need working code before we can start worrying about optimization. |
| Writing clear, intuitive deep learning code can be challenging, |
| and the first thing any practitioner must deal with is the language syntax itself. |
| Complicating matters, of the many deep learning libraries out there, |
| each has its own approach to programming style.</p> |
| <p>In this document, we focus on two of the most important high-level design decisions:</p> |
| <ol class="simple"> |
| <li>Whether to embrace the <em>symbolic</em> or <em>imperative</em> paradigm for mathematical computation.</li> |
| <li>Whether to build networks with bigger (more abstract) or more atomic operations.</li> |
| </ol> |
| <p>Throughout, we’ll focus on the programming models themselves. |
| When programming style decisions may impact performance, we point this out, |
| but we don’t dwell on specific implementation details.</p> |
| <div class="section" id="symbolic-vs-imperative-programs"> |
| <span id="symbolic-vs-imperative-programs"></span><h2>Symbolic vs. Imperative Programs<a class="headerlink" href="#symbolic-vs-imperative-programs" title="Permalink to this headline">¶</a></h2> |
| <p>If you are a Python or C++ programmer, then you’re already familiar with imperative programs. |
| Imperative-style programs perform computation as you run them. |
| Most code you write in Python is imperative, as is the following NumPy snippet.</p> |
| <div class="highlight-python"><div class="highlight"><pre><span></span> <span class="kn">import</span> <span class="nn">numpy</span> <span class="kn">as</span> <span class="nn">np</span> |
| <span class="n">a</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span> |
| <span class="n">b</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span> <span class="o">*</span> <span class="mi">2</span> |
| <span class="n">c</span> <span class="o">=</span> <span class="n">b</span> <span class="o">*</span> <span class="n">a</span> |
| <span class="n">d</span> <span class="o">=</span> <span class="n">c</span> <span class="o">+</span> <span class="mi">1</span> |
| </pre></div> |
| </div> |
| <p>When the program executes <code class="docutils literal"><span class="pre">c</span> <span class="pre">=</span> <span class="pre">b</span> <span class="pre">*</span> <span class="pre">a</span></code>, it runs the actual numerical computation.</p> |
| <p>Symbolic programs are a bit different. With symbolic-style programs, |
| we first define a (potentially complex) function abstractly. |
| When defining the function, no actual numerical computation takes place. |
| We define the abstract function in terms of placeholder values. |
| Then we can compile the function, and evaluate it given real inputs. |
| In the following example, we rewrite the imperative program from above |
| as a symbolic-style program:</p> |
| <div class="highlight-python"><div class="highlight"><pre><span></span> <span class="n">A</span> <span class="o">=</span> <span class="n">Variable</span><span class="p">(</span><span class="s1">'A'</span><span class="p">)</span> |
| <span class="n">B</span> <span class="o">=</span> <span class="n">Variable</span><span class="p">(</span><span class="s1">'B'</span><span class="p">)</span> |
| <span class="n">C</span> <span class="o">=</span> <span class="n">B</span> <span class="o">*</span> <span class="n">A</span> |
| <span class="n">D</span> <span class="o">=</span> <span class="n">C</span> <span class="o">+</span> <span class="n">Constant</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> |
| <span class="c1"># compiles the function</span> |
| <span class="n">f</span> <span class="o">=</span> <span class="nb">compile</span><span class="p">(</span><span class="n">D</span><span class="p">)</span> |
| <span class="n">d</span> <span class="o">=</span> <span class="n">f</span><span class="p">(</span><span class="n">A</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span> <span class="n">B</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span><span class="o">*</span><span class="mi">2</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <p>As you can see, in the symbolic version, when <code class="docutils literal"><span class="pre">C</span> <span class="pre">=</span> <span class="pre">B</span> <span class="pre">*</span> <span class="pre">A</span></code> is executed, no computation occurs. |
| Instead, this operation generates a <em>computation graph</em> (also called a <em>symbolic graph</em>) |
| that represents the computation. |
| The following figure shows a computation graph to compute <code class="docutils literal"><span class="pre">D</span></code>.</p> |
| <p><img alt="Comp Graph" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/prog_model/comp_graph.png"/></p> |
| <p>Most symbolic-style programs contain, either explicitly or implicitly, a <em>compile</em> step. |
| This converts the computation graph into a function that we can later call. |
| In the above example, numerical computation only occurs in the last line of code. |
| The defining characteristic of symbolic programs is their clear separation |
| between building the computation graph and executing it. |
For neural networks, we typically define the entire model as a single computation graph.</p>
| <p>Among other popular deep learning libraries, Torch, Chainer, and Minerva embrace the imperative style. |
| Examples of symbolic-style deep learning libraries include Theano, CGT, and TensorFlow. |
| We might also view libraries like CXXNet and Caffe, which rely on configuration files, as symbolic-style libraries. |
| In this interpretation, we’d consider the content of the configuration file as defining the computation graph.</p> |
| <p>Now that you understand the difference between these two programming models, let’s compare the advantages of each.</p> |
| <div class="section" id="imperative-programs-tend-to-be-more-flexible"> |
| <span id="imperative-programs-tend-to-be-more-flexible"></span><h3>Imperative Programs Tend to be More Flexible<a class="headerlink" href="#imperative-programs-tend-to-be-more-flexible" title="Permalink to this headline">¶</a></h3> |
| <p>When you’re using an imperative-style library from Python, you are writing in Python. |
Nearly anything that would be intuitive to write in Python, you can accelerate by calling down to the imperative deep learning library in the appropriate places.
| On the other hand, when you write a symbolic program, you may not have access to all the familiar Python constructs, like iteration. |
| Consider the following imperative program, and think about how you can translate this into a symbolic program.</p> |
| <div class="highlight-python"><div class="highlight"><pre><span></span> <span class="n">a</span> <span class="o">=</span> <span class="mi">2</span> |
| <span class="n">b</span> <span class="o">=</span> <span class="n">a</span> <span class="o">+</span> <span class="mi">1</span> |
| <span class="n">d</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span> |
 <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">):</span>
| <span class="n">d</span> <span class="o">+=</span> <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span> |
| </pre></div> |
| </div> |
<p>This wouldn’t be so easy if the symbolic API didn’t support the Python for-loop.
| When you write a symbolic program in Python, you’re <em>not</em> writing in Python. |
| Instead, you’re writing in a domain-specific language (DSL) defined by the symbolic API. |
| The symbolic APIs found in deep learning libraries |
| are powerful DSLs that generate callable computation graphs for neural networks.</p> |
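<p>As a hedged sketch (reusing the hypothetical <code class="docutils literal"><span class="pre">Variable</span></code>/<code class="docutils literal"><span class="pre">compile</span></code> API from above), a symbolic API without a loop construct can still express this program, because the Python loop runs at graph-construction time and is unrolled into the graph:</p>
<div class="highlight-python"><div class="highlight"><pre># Hypothetical symbolic API: the Python loop executes while *building*
# the graph, so ten addition nodes are unrolled into the graph.
D = Variable('D')
acc = D
for i in range(10):
    acc = acc + Variable('zeros_%d' % i)
f = compile(acc)
</pre></div>
</div>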
| <!-- In that sense, config-file input libraries are all symbolic. --><p>Intuitively, you might say that imperative programs |
| are more <em>native</em> than symbolic programs. |
| It’s easier to use native language features. |
| For example, it’s straightforward to print out the values |
| in the middle of computation or to use native control flow and loops |
| at any point in the flow of computation.</p> |
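<p>For example, a few lines of plain NumPy (a sketch, not tied to any particular deep learning library) are enough to inspect an intermediate value and branch on it with native control flow:</p>
<div class="highlight-python"><div class="highlight"><pre>import numpy as np

a = np.ones(10)
b = np.ones(10) * 2
c = b * a
print(c.sum())        # inspect an intermediate value immediately
if c.sum() > 10:      # branch on a computed value with native control flow
    d = c + 1
else:
    d = c - 1
</pre></div>
</div>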
| </div> |
| <div class="section" id="symbolic-programs-tend-to-be-more-efficient"> |
| <span id="symbolic-programs-tend-to-be-more-efficient"></span><h3>Symbolic Programs Tend to be More Efficient<a class="headerlink" href="#symbolic-programs-tend-to-be-more-efficient" title="Permalink to this headline">¶</a></h3> |
| <p>As we’ve seen, imperative programs tend to be flexible |
| and fit nicely into the programming flow of a host language. |
| So you might wonder, why do so many deep learning libraries |
| embrace the symbolic paradigm? |
| The main reason is efficiency, both in terms of memory and speed. |
| Let’s revisit our toy example from before.</p> |
| <div class="highlight-python"><div class="highlight"><pre><span></span> <span class="kn">import</span> <span class="nn">numpy</span> <span class="kn">as</span> <span class="nn">np</span> |
| <span class="n">a</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span> |
| <span class="n">b</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span> <span class="o">*</span> <span class="mi">2</span> |
| <span class="n">c</span> <span class="o">=</span> <span class="n">b</span> <span class="o">*</span> <span class="n">a</span> |
| <span class="n">d</span> <span class="o">=</span> <span class="n">c</span> <span class="o">+</span> <span class="mi">1</span> |
| <span class="o">...</span> |
| </pre></div> |
| </div> |
| <p><img alt="Comp Graph" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/prog_model/comp_graph.png"/></p> |
| <p>Assume that each cell in the array occupies 8 bytes of memory. |
| How much memory do you need to execute this program in the Python console?</p> |
<p>Running this as an imperative program, we need to allocate memory at each line,
leaving us with 4 arrays of size 10.
| So we’ll need <code class="docutils literal"><span class="pre">4</span> <span class="pre">*</span> <span class="pre">10</span> <span class="pre">*</span> <span class="pre">8</span> <span class="pre">=</span> <span class="pre">320</span></code> bytes. |
| On the other hand, if we built a computation graph, |
| and knew in advance that we only needed <code class="docutils literal"><span class="pre">d</span></code>, |
| we could reuse the memory originally allocated for intermediate values. |
| For example, by performing computations in-place, |
| we might recycle the bits allocated for <code class="docutils literal"><span class="pre">b</span></code> to store <code class="docutils literal"><span class="pre">c</span></code>. |
| And we might recycle the bits allocated for <code class="docutils literal"><span class="pre">c</span></code> to store <code class="docutils literal"><span class="pre">d</span></code>. |
| In the end we could cut our memory requirement in half, |
| requiring just <code class="docutils literal"><span class="pre">2</span> <span class="pre">*</span> <span class="pre">10</span> <span class="pre">*</span> <span class="pre">8</span> <span class="pre">=</span> <span class="pre">160</span></code> bytes.</p> |
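<p>The same in-place reuse can be written by hand in NumPy; this is a sketch of what a graph executor could do automatically once it knows that only <code class="docutils literal"><span class="pre">d</span></code> is needed:</p>
<div class="highlight-python"><div class="highlight"><pre>import numpy as np

a = np.ones(10)
b = np.ones(10) * 2           # two arrays allocated so far
np.multiply(b, a, out=b)      # c reuses b's memory: b now holds b * a
np.add(b, 1, out=b)           # d reuses the same memory again
d = b                         # total: 2 * 10 * 8 = 160 bytes
</pre></div>
</div>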
| <p>Symbolic programs are more <em>restricted</em>. |
| When we call <code class="docutils literal"><span class="pre">compile</span></code> on D, we tell the system |
| that only the value of <code class="docutils literal"><span class="pre">d</span></code> is needed. |
| The intermediate values of the computation, |
in this case <code class="docutils literal"><span class="pre">c</span></code>, are then invisible to us.
| <p>We benefit because the symbolic programs |
| can then safely reuse the memory for in-place computation. |
| But on the other hand, if we later decide that we need to access <code class="docutils literal"><span class="pre">c</span></code>, we’re out of luck. |
| So imperative programs are better prepared to encounter all possible demands. |
| If we ran the imperative version of the code in a Python console, |
| we could inspect any of the intermediate variables in the future.</p> |
| <!-- Of course, this is somewhat misleading, because garbage collection can occur in imperative programs and memory could then be reused. |
| However, imperative programs do need to be "prepared to encounter all possible demands," and this limits the optimization you can perform. This is true for non-trivial cases, such |
| as gradient calculation, which we discuss in next section. --><p>Symbolic programs can also perform another kind of optimization, called operation folding. |
| Returning to our toy example, the multiplication and addition operations |
| can be folded into one operation, as shown in the following graph. |
If the computation runs on a GPU,
one kernel will be launched, instead of two.
| In fact, this is one way we hand-craft operations |
| in optimized libraries, such as CXXNet and Caffe. |
| Operation folding improves computation efficiency.</p> |
| <p><img alt="Comp Graph Folded" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/prog_model/comp_graph_fold.png"/></p> |
<p>Note that you can’t perform operation folding in imperative programs,
| because the intermediate values might be referenced in the future. |
| Operation folding is possible in symbolic programs |
| because you get the entire computation graph, |
| and a clear specification of which values will be needed and which are not.</p> |
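<p>As a pure-Python sketch of the idea, a folded multiply-add makes a single pass over the data instead of two:</p>
<div class="highlight-python"><div class="highlight"><pre>import numpy as np

def fused_mul_add(a, b):
    """One conceptual kernel: a single pass computing b * a + 1."""
    d = np.empty_like(a)
    for i in range(a.size):   # one loop, instead of one for * and one for +
        d[i] = b[i] * a[i] + 1.0
    return d

d = fused_mul_add(np.ones(10), np.ones(10) * 2)
</pre></div>
</div>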
| </div> |
| <div class="section" id="case-study-backprop-and-autodiff"> |
| <span id="case-study-backprop-and-autodiff"></span><h3>Case Study: Backprop and AutoDiff<a class="headerlink" href="#case-study-backprop-and-autodiff" title="Permalink to this headline">¶</a></h3> |
| <p>In this section, we compare the two programming models |
| on the problem of auto differentiation, or backpropagation. |
| Differentiation is of vital importance in deep learning |
| because it’s the mechanism by which we train our models. |
| In any deep learning model, we define a <em>loss function</em>. |
| A <em>loss function</em> measures how far the model is from the desired output. |
| We then typically pass over training examples (pairs of inputs and ground-truth outputs). |
| At each step we update the model’s <em>parameters</em> to minimize the loss. |
| To determine the direction in which to update the parameters, |
| we need to take the derivative of the loss function with respect to the parameters.</p> |
| <p>In the past, whenever someone defined a new model, |
| they had to work out the derivative calculations by hand. |
| While the math is reasonably straightforward, |
| for complex models, it can be time-consuming and tedious work. |
| All modern deep learning libraries make the practitioner/researcher’s job |
| much easier, by automatically solving the problem of gradient calculation.</p> |
| <p>Both imperative and symbolic programs can perform gradient calculation. |
| So let’s take a look at how you might perform automatic differentiation with each.</p> |
| <p>Let’s start with imperative programs. |
| The following example Python code performs automatic differentiation using our toy example:</p> |
<div class="highlight-python"><div class="highlight"><pre><span></span> <span class="k">class</span> <span class="nc">array</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span>
     <span class="sd">"""Simple Array object that supports autodiff."""</span>
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">value</span> <span class="o">=</span> <span class="n">value</span> |
| <span class="k">if</span> <span class="n">name</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">grad</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">g</span> <span class="p">:</span> <span class="p">{</span><span class="n">name</span> <span class="p">:</span> <span class="n">g</span><span class="p">}</span> |
| |
| <span class="k">def</span> <span class="fm">__add__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> |
| <span class="n">ret</span> <span class="o">=</span> <span class="n">array</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">value</span> <span class="o">+</span> <span class="n">other</span><span class="p">)</span> |
| <span class="n">ret</span><span class="o">.</span><span class="n">grad</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">g</span> <span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">grad</span><span class="p">(</span><span class="n">g</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">ret</span> |
| |
| <span class="k">def</span> <span class="fm">__mul__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">array</span><span class="p">)</span> |
| <span class="n">ret</span> <span class="o">=</span> <span class="n">array</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">value</span> <span class="o">*</span> <span class="n">other</span><span class="o">.</span><span class="n">value</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">grad</span><span class="p">(</span><span class="n">g</span><span class="p">):</span> |
| <span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">grad</span><span class="p">(</span><span class="n">g</span> <span class="o">*</span> <span class="n">other</span><span class="o">.</span><span class="n">value</span><span class="p">)</span> |
| <span class="n">x</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">grad</span><span class="p">(</span><span class="n">g</span> <span class="o">*</span> <span class="bp">self</span><span class="o">.</span><span class="n">value</span><span class="p">))</span> |
| <span class="k">return</span> <span class="n">x</span> |
| <span class="n">ret</span><span class="o">.</span><span class="n">grad</span> <span class="o">=</span> <span class="n">grad</span> |
| <span class="k">return</span> <span class="n">ret</span> |
| |
| <span class="c1"># some examples</span> |
| <span class="n">a</span> <span class="o">=</span> <span class="n">array</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="s1">'a'</span><span class="p">)</span> |
| <span class="n">b</span> <span class="o">=</span> <span class="n">array</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="s1">'b'</span><span class="p">)</span> |
| <span class="n">c</span> <span class="o">=</span> <span class="n">b</span> <span class="o">*</span> <span class="n">a</span> |
| <span class="n">d</span> <span class="o">=</span> <span class="n">c</span> <span class="o">+</span> <span class="mi">1</span> |
 <span class="k">print</span><span class="p">(</span><span class="n">d</span><span class="o">.</span><span class="n">value</span><span class="p">)</span>
 <span class="k">print</span><span class="p">(</span><span class="n">d</span><span class="o">.</span><span class="n">grad</span><span class="p">(</span><span class="mi">1</span><span class="p">))</span>
| <span class="c1"># Results</span> |
| <span class="c1"># 3</span> |
| <span class="c1"># {'a': 2, 'b': 1}</span> |
| </pre></div> |
| </div> |
<p>In this code, each array object contains a grad function (actually a closure).
| When you run <code class="docutils literal"><span class="pre">d.grad</span></code>, it recursively invokes the grad function of its inputs, |
| backprops the gradient value back, and |
| returns the gradient value of each input.</p> |
| <p>This might look a bit complicated, so let’s consider |
| the gradient calculation for symbolic programs. |
| The following program performs symbolic gradient calculation for the same task.</p> |
| <div class="highlight-python"><div class="highlight"><pre><span></span> <span class="n">A</span> <span class="o">=</span> <span class="n">Variable</span><span class="p">(</span><span class="s1">'A'</span><span class="p">)</span> |
| <span class="n">B</span> <span class="o">=</span> <span class="n">Variable</span><span class="p">(</span><span class="s1">'B'</span><span class="p">)</span> |
| <span class="n">C</span> <span class="o">=</span> <span class="n">B</span> <span class="o">*</span> <span class="n">A</span> |
| <span class="n">D</span> <span class="o">=</span> <span class="n">C</span> <span class="o">+</span> <span class="n">Constant</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> |
| <span class="c1"># get gradient node.</span> |
| <span class="n">gA</span><span class="p">,</span> <span class="n">gB</span> <span class="o">=</span> <span class="n">D</span><span class="o">.</span><span class="n">grad</span><span class="p">(</span><span class="n">wrt</span><span class="o">=</span><span class="p">[</span><span class="n">A</span><span class="p">,</span> <span class="n">B</span><span class="p">])</span> |
| <span class="c1"># compiles the gradient function.</span> |
| <span class="n">f</span> <span class="o">=</span> <span class="nb">compile</span><span class="p">([</span><span class="n">gA</span><span class="p">,</span> <span class="n">gB</span><span class="p">])</span> |
| <span class="n">grad_a</span><span class="p">,</span> <span class="n">grad_b</span> <span class="o">=</span> <span class="n">f</span><span class="p">(</span><span class="n">A</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span> <span class="n">B</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span><span class="o">*</span><span class="mi">2</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <p>The grad function of <code class="docutils literal"><span class="pre">D</span></code> generates a backward computation graph, |
and returns gradient nodes, <code class="docutils literal"><span class="pre">gA</span></code> and <code class="docutils literal"><span class="pre">gB</span></code>,
| which correspond to the red nodes in the following figure.</p> |
| <p><img alt="Comp Graph Folded" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/prog_model/comp_graph_backward.png"/></p> |
| <p>The imperative program actually does the same thing as the symbolic program. |
| It implicitly saves a backward computation graph in the grad closure. |
When you invoke <code class="docutils literal"><span class="pre">d.grad</span></code>, you start from <code class="docutils literal"><span class="pre">d</span></code> (the imperative counterpart of <code class="docutils literal"><span class="pre">D</span></code>),
| backtrack through the graph to compute the gradient, and collect the results.</p> |
| <p>The gradient calculations in both symbolic |
| and imperative programming follow the same pattern. |
| What’s the difference then? |
| Recall the <em>be prepared to encounter all possible demands</em> requirement of imperative programs. |
| If you are creating an array library that supports automatic differentiation, |
| you have to keep the grad closure along with the computation. |
| This means that none of the history variables can be |
| garbage-collected because they are referenced by variable <code class="docutils literal"><span class="pre">d</span></code> by way of function closure.</p> |
| <p>What if you want to compute only the value of <code class="docutils literal"><span class="pre">d</span></code>, |
| and don’t want the gradient value? |
In symbolic programming, you declare this with <code class="docutils literal"><span class="pre">f</span> <span class="pre">=</span> <span class="pre">compile([D])</span></code>.
| This also declares the boundary of computation, |
| telling the system that you want to compute only the forward pass. |
| As a result, the system can free the memory of previous results, |
| and share the memory between inputs and outputs.</p> |
| <p>Imagine running a deep neural network with <code class="docutils literal"><span class="pre">n</span></code> layers. |
| If you are running only the forward pass, |
not the backward (gradient) pass,
you need to allocate only two copies of
temporary space to store the values of the intermediate layers,
instead of <code class="docutils literal"><span class="pre">n</span></code> copies of them.
However, because imperative programs need to be prepared
to encounter all possible demands of getting the gradient,
they have to store the intermediate values,
which requires <code class="docutils literal"><span class="pre">n</span></code> copies of temporary space.</p>
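<p>Here is a minimal sketch of the two-buffer trick for a forward-only pass; the per-layer functions are hypothetical and assumed to write their activations into a preallocated output buffer:</p>
<div class="highlight-python"><div class="highlight"><pre>import numpy as np

def forward_only(x, layers, width):
    # Two reusable buffers instead of n: each layer reads one buffer
    # and writes its activation into the other.
    buf = [np.empty(width), np.empty(width)]
    cur = x
    for i, layer in enumerate(layers):
        out = buf[i % 2]
        layer(cur, out)   # hypothetical signature: layer(input, output_buffer)
        cur = out
    return cur
</pre></div>
</div>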
| <p>As you can see, the level of optimization depends |
| on the restrictions on what you can do. |
| Symbolic programs ask you to clearly specify |
| these restrictions when you compile the graph. |
On the other hand, imperative programs
| must be prepared for a wider range of demands. |
| Symbolic programs have a natural advantage |
| because they know more about what you do and don’t want.</p> |
| <p>There are ways in which we can modify imperative programs |
| to incorporate similar restrictions. |
For example, one solution to the preceding problem
is to introduce a no-gradient context variable
that turns gradient calculation off.</p>
| <div class="highlight-python"><div class="highlight"><pre><span></span> <span class="k">with</span> <span class="n">context</span><span class="o">.</span><span class="n">NoGradient</span><span class="p">():</span> |
| <span class="n">a</span> <span class="o">=</span> <span class="n">array</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="s1">'a'</span><span class="p">)</span> |
| <span class="n">b</span> <span class="o">=</span> <span class="n">array</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="s1">'b'</span><span class="p">)</span> |
| <span class="n">c</span> <span class="o">=</span> <span class="n">b</span> <span class="o">*</span> <span class="n">a</span> |
| <span class="n">d</span> <span class="o">=</span> <span class="n">c</span> <span class="o">+</span> <span class="mi">1</span> |
| </pre></div> |
| </div> |
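<p>One way such a context might be implemented (a minimal sketch; <code class="docutils literal"><span class="pre">context.NoGradient</span></code> is not the API of any particular library) is a flag that the array class checks before recording grad closures:</p>
<div class="highlight-python"><div class="highlight"><pre>class NoGradient(object):
    """Minimal sketch of a context manager that disables gradient recording."""
    enabled = False
    def __enter__(self):
        NoGradient.enabled = True
    def __exit__(self, *args):
        NoGradient.enabled = False

# Inside array.__add__ / array.__mul__, record the closure only when allowed:
#     if not NoGradient.enabled:
#         ret.grad = grad
</pre></div>
</div>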
| <!-- This provides an imperative program with the ability to impose some restrictions, but reduces efficiency. --><p>However, this example still must be prepared to encounter all possible demands, |
| which means that you can’t perform the in-place calculation |
| to reuse memory in the forward pass (a trick commonly used to reduce GPU memory usage). |
| The techniques we’ve discussed generate an explicit backward pass. |
Some libraries, such as Caffe and CXXNet, perform backprop implicitly on the same graph.
| The approach we’ve discussed in this section also applies to them.</p> |
| <p>Most configuration-file-based libraries, |
such as CXXNet and Caffe, are designed
| to meet one or two generic requirements: |
| get the activation of each layer, |
| or get the gradient of all of the weights. |
| These libraries have the same problem: |
| the more generic operations the library has to support, |
| the less optimization (memory sharing) you can do, |
| based on the same data structure.</p> |
| <p>As you can see, the trade-off between restriction |
| and flexibility is the same for most cases.</p> |
| </div> |
| <div class="section" id="model-checkpoint"> |
| <span id="model-checkpoint"></span><h3>Model Checkpoint<a class="headerlink" href="#model-checkpoint" title="Permalink to this headline">¶</a></h3> |
<p>It’s important to be able to save a model and load it back later.
| There are different ways to <em>save</em> your work. |
| Normally, to save a neural network, |
| you need to save two things: a net configuration |
| for the structure of the neural network and the weights of the neural network.</p> |
<p>The ability to checkpoint the configuration is a plus for symbolic programs.
| Because the symbolic construction phase does not perform computation, |
| you can directly serialize the computation graph, and load it back later. |
| This solves the problem of saving the configuration |
| without introducing an additional layer.</p> |
| <div class="highlight-python"><div class="highlight"><pre><span></span> <span class="n">A</span> <span class="o">=</span> <span class="n">Variable</span><span class="p">(</span><span class="s1">'A'</span><span class="p">)</span> |
| <span class="n">B</span> <span class="o">=</span> <span class="n">Variable</span><span class="p">(</span><span class="s1">'B'</span><span class="p">)</span> |
| <span class="n">C</span> <span class="o">=</span> <span class="n">B</span> <span class="o">*</span> <span class="n">A</span> |
| <span class="n">D</span> <span class="o">=</span> <span class="n">C</span> <span class="o">+</span> <span class="n">Constant</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> |
| <span class="n">D</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="s1">'mygraph'</span><span class="p">)</span> |
| <span class="o">...</span> |
| <span class="n">D2</span> <span class="o">=</span> <span class="n">load</span><span class="p">(</span><span class="s1">'mygraph'</span><span class="p">)</span> |
| <span class="n">f</span> <span class="o">=</span> <span class="nb">compile</span><span class="p">([</span><span class="n">D2</span><span class="p">])</span> |
| <span class="c1"># more operations</span> |
| <span class="o">...</span> |
| </pre></div> |
| </div> |
| <p>Because an imperative program executes as it describes the computation, |
| you have to save the code itself as the <code class="docutils literal"><span class="pre">configuration</span></code>, |
| or build another configuration layer on top of the imperative language.</p> |
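<p>In either style, the weights themselves can be serialized separately from the configuration, for example as plain arrays (a sketch using NumPy):</p>
<div class="highlight-python"><div class="highlight"><pre>import numpy as np

# Save the parameters; the graph/configuration is saved separately, as above.
params = {'A': np.ones(10), 'B': np.ones(10) * 2}
np.savez('myweights.npz', **params)

# ... later, load them back and bind them to the reloaded graph.
loaded = np.load('myweights.npz')
weight_a = loaded['A']
</pre></div>
</div>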
| </div> |
| <div class="section" id="parameter-updates"> |
| <span id="parameter-updates"></span><h3>Parameter Updates<a class="headerlink" href="#parameter-updates" title="Permalink to this headline">¶</a></h3> |
| <p>Most symbolic programs are data flow (computation) graphs. |
| Data flow graphs describe computation. |
| But it’s not obvious how to use graphs to describe parameter updates. |
| That’s because parameter updates introduce mutation, |
| which is not a data flow concept. |
| Most symbolic programs introduce a special update statement |
| to update persistent state in the programs.</p> |
| <p>It’s usually easier to write parameter updates in an imperative style, |
| especially when you need multiple updates that relate to each other. |
| For symbolic programs, the update statement is also executed as you call it. |
| So in that sense, most symbolic deep learning libraries |
| fall back on the imperative approach to perform updates, |
| while using the symbolic approach to perform gradient calculation.</p> |
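<p>Concretely, a training loop in this mixed style often looks like the following sketch, reusing the hypothetical compiled gradient function <code class="docutils literal"><span class="pre">f</span></code> from the autodiff example; the update rule itself is plain imperative NumPy:</p>
<div class="highlight-python"><div class="highlight"><pre>import numpy as np

lr = 0.1
weight_a, weight_b = np.ones(10), np.ones(10) * 2
for step in range(100):
    # Symbolic part: one compiled call computes the gradients.
    grad_a, grad_b = f(A=weight_a, B=weight_b)
    # Imperative part: the (possibly interdependent) updates are plain Python.
    weight_a -= lr * grad_a
    weight_b -= lr * grad_b
</pre></div>
</div>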
| </div> |
| <div class="section" id="there-is-no-strict-boundary"> |
| <span id="there-is-no-strict-boundary"></span><h3>There Is No Strict Boundary<a class="headerlink" href="#there-is-no-strict-boundary" title="Permalink to this headline">¶</a></h3> |
| <p>In comparing the two programming styles, |
| some of our arguments might not be strictly true, |
| i.e., it’s possible to make an imperative program |
| more like a traditional symbolic program or vice versa. |
| However, the two archetypes are useful abstractions, |
| especially for understanding the differences between deep learning libraries. |
| We might reasonably conclude that there is no clear boundary between programming styles. |
| For example, you can create a just-in-time (JIT) compiler in Python |
| to compile imperative Python programs, |
| which provides some of the advantages of global |
| information held in symbolic programs.</p> |
| </div> |
| </div> |
| <div class="section" id="big-vs-small-operations"> |
| <span id="big-vs-small-operations"></span><h2>Big vs. Small Operations<a class="headerlink" href="#big-vs-small-operations" title="Permalink to this headline">¶</a></h2> |
| <p>When designing a deep learning library, another important programming model decision |
| is precisely what operations to support. |
| In general, there are two families of operations supported by most deep learning libraries:</p> |
| <ul class="simple"> |
| <li>Big operations - typically for computing neural network layers (e.g. FullyConnected and BatchNormalize).</li> |
| <li>Small operations - mathematical functions like matrix multiplication and element-wise addition.</li> |
| </ul> |
| <p>Libraries like CXXNet and Caffe support layer-level operations. |
| Libraries like Theano and Minerva support fine-grained operations.</p> |
| <div class="section" id="smaller-operations-can-be-more-flexible"> |
| <span id="smaller-operations-can-be-more-flexible"></span><h3>Smaller Operations Can Be More Flexible<a class="headerlink" href="#smaller-operations-can-be-more-flexible" title="Permalink to this headline">¶</a></h3> |
| <p>It’s quite natural to use smaller operations to compose bigger operations. |
For example, the sigmoid unit can simply be composed from division, addition, and exponentiation:</p>
| <div class="highlight-python"><div class="highlight"><pre><span></span> <span class="n">sigmoid</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="o">=</span> <span class="mf">1.0</span> <span class="o">/</span> <span class="p">(</span><span class="mf">1.0</span> <span class="o">+</span> <span class="n">exp</span><span class="p">(</span><span class="o">-</span><span class="n">x</span><span class="p">))</span> |
| </pre></div> |
| </div> |
| <p>Using smaller operations as building blocks, you can express nearly anything you want. |
| If you’re more familiar with CXXNet- or Caffe-style layers, |
note that these operations don’t differ from layers, except that they are smaller.</p>
| <div class="highlight-python"><div class="highlight"><pre><span></span> <span class="n">SigmoidLayer</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="o">=</span> <span class="n">EWiseDivisionLayer</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">AddScalarLayer</span><span class="p">(</span><span class="n">ExpLayer</span><span class="p">(</span><span class="o">-</span><span class="n">x</span><span class="p">),</span> <span class="mf">1.0</span><span class="p">))</span> |
| </pre></div> |
| </div> |
| <p>This expression composes three layers, |
| with each defining its forward and backward (gradient) function. |
| Using smaller operations gives you the advantage of building new layers quickly, |
| because you only need to compose the components.</p> |
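<p>For example, a sigmoid layer built from small operations gets its backward function essentially for free from the gradients of the pieces; here is a NumPy sketch of the composed forward and backward:</p>
<div class="highlight-python"><div class="highlight"><pre>import numpy as np

def sigmoid_forward(x):
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_backward(x, g):
    # The chain rule through divide, add, and exp collapses to s * (1 - s).
    s = sigmoid_forward(x)
    return g * s * (1.0 - s)
</pre></div>
</div>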
| </div> |
| <div class="section" id="big-operations-are-more-efficient"> |
| <span id="big-operations-are-more-efficient"></span><h3>Big Operations Are More Efficient<a class="headerlink" href="#big-operations-are-more-efficient" title="Permalink to this headline">¶</a></h3> |
<p>Composing the sigmoid unit directly from smaller operations requires three layers of computation, instead of one:</p>
| <div class="highlight-python"><div class="highlight"><pre><span></span> <span class="n">SigmoidLayer</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="o">=</span> <span class="n">EWiseDivisionLayer</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">AddScalarLayer</span><span class="p">(</span><span class="n">ExpLayer</span><span class="p">(</span><span class="o">-</span><span class="n">x</span><span class="p">),</span> <span class="mf">1.0</span><span class="p">))</span> |
| </pre></div> |
| </div> |
| <p>This code creates overhead for computation and memory (which could be optimized, with cost).</p> |
| <p>Libraries like CXXNet and Caffe take a different approach. |
They support coarse-grained operations,
such as BatchNormalization and SigmoidLayer, directly;
the calculation kernel for each such layer is hand-crafted
and launches one or only a few CUDA kernels.
This makes these implementations more efficient.</p>
| </div> |
| <div class="section" id="compilation-and-optimization"> |
| <span id="compilation-and-optimization"></span><h3>Compilation and Optimization<a class="headerlink" href="#compilation-and-optimization" title="Permalink to this headline">¶</a></h3> |
| <p>Can small operations be optimized? Of course, they can. |
| Let’s look at the system optimization part of the compilation engine. |
| Two types of optimization can be performed on the computation graph:</p> |
| <ul class="simple"> |
| <li>Memory allocation optimization, to reuse the memory of the intermediate computations.</li> |
| <li>Operator fusion, to detect sub-graph patterns, such as the sigmoid, and fuse them into a bigger operation kernel.</li> |
| </ul> |
<p>Memory allocation optimization isn’t restricted to small-operation graphs;
you can use it with bigger-operation graphs, too.
However, this optimization isn’t essential
for bigger-operation libraries like CXXNet and Caffe,
which have no explicit compilation step.
Instead, these libraries have an implicit (and simple) <code class="docutils literal"><span class="pre">compilation</span> <span class="pre">step</span></code>
that translates the layers into a fixed forward and
backprop execution plan, running each operation one by one.</p>
| <p>For computation graphs with smaller operations, |
| these optimizations are crucial to performance. |
| Because the operations are small, |
| there are many sub-graph patterns that can be matched. |
| Also, because the final, generated operations |
| might not be enumerable, |
| an explicit recompilation of the kernels is required, |
as opposed to the fixed set of precompiled kernels
in the big-operation libraries.
| This creates compilation overhead for the symbolic libraries |
| that support small operations. |
| Requiring compilation optimization also creates engineering overhead |
| for the libraries that solely support smaller operations.</p> |
| <p>As in the case of symbolic vs. imperative, |
the bigger-operation libraries “cheat”
by restricting you to a fixed set of common layers,
so that you, rather than the library, perform the sub-graph matching.
This moves the compilation overhead to the human brain, which is usually not too bad.</p>
| </div> |
| <div class="section" id="expression-template-and-statically-typed-language"> |
| <span id="expression-template-and-statically-typed-language"></span><h3>Expression Template and Statically Typed Language<a class="headerlink" href="#expression-template-and-statically-typed-language" title="Permalink to this headline">¶</a></h3> |
<p>There is always a need to write small operations and compose them into bigger ones.
Libraries like Caffe use hand-crafted kernels to build these bigger blocks;
otherwise, you would have to compose the smaller operations from Python.</p>
| <p>There’s a third choice that works pretty well. |
| This is called the expression template. |
| Basically, you use template programming to |
| generate generic kernels from an expression tree at compile time. |
| For details, see <a class="reference external" href="https://github.com/dmlc/mshadow/blob/master/guide/exp-template/README.md">Expression Template Tutorial</a>. |
CXXNet makes extensive use of expression templates,
| which enables creating much shorter and more readable code that matches |
| the performance of hand-crafted kernels.</p> |
<p>The difference between expression templates and Python kernel generation
is that expression evaluation is done at C++ compile time, with full type information,
so there is no additional runtime overhead.
| In principle, this is also possible with other statically typed languages that support templates, |
| but we’ve seen this trick used only in C++.</p> |
| <p>Expression template libraries create a middle ground between Python operations |
| and hand-crafted big kernels by allowing C++ users to craft efficient big |
| operations by composing smaller operations. It’s an option worth considering.</p> |
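<p>Expression templates themselves rely on C++ compile-time machinery, but the lazy-evaluation idea behind them can be sketched in Python with operator overloading: operators build a deferred expression, and a single evaluation pass fuses the element-wise loops so that no intermediate arrays are materialized:</p>
<div class="highlight-python"><div class="highlight"><pre>import numpy as np

class Exp(object):
    """Lazy element-wise expression; evaluation is deferred and fused."""
    def __init__(self, fn):
        self.fn = fn                                   # index -> scalar value
    def __add__(self, other):
        return Exp(lambda i: self.fn(i) + other.fn(i))
    def __mul__(self, other):
        return Exp(lambda i: self.fn(i) * other.fn(i))

def lift(arr):
    return Exp(lambda i: arr[i])

def evaluate(exp, n):
    # One fused loop; no temporaries for intermediate products or sums.
    return np.array([exp.fn(i) for i in range(n)])

a, b = np.ones(10), np.ones(10) * 2
d = evaluate(lift(a) * lift(b) + lift(np.ones(10)), 10)
</pre></div>
</div>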
| </div> |
| </div> |
| <div class="section" id="mix-the-approaches"> |
| <span id="mix-the-approaches"></span><h2>Mix the Approaches<a class="headerlink" href="#mix-the-approaches" title="Permalink to this headline">¶</a></h2> |
| <p>Now that we’ve compared the programming models, which one should you choose? |
| Before delving into that, we should emphasize that depending on the problems you’re trying to solve, |
| our comparison might not necessarily have a big impact.</p> |
| <p>Remember <a class="reference external" href="https://en.wikipedia.org/wiki/Amdahl%27s_law">Amdahl’s law</a>: |
| If you are optimizing a non-performance-critical part of your problem, |
| you won’t get much of a performance gain.</p> |
| <p>As you’ve seen, there usually is a trade-off between efficiency, |
| flexibility, and engineering complexity. |
| The more suitable programming style depends on the problem you are trying to solve. |
| For example, imperative programs are better for parameter updates, |
| and symbolic programs for gradient calculation.</p> |
| <p>We advocate <em>mixing</em> the approaches. |
| Sometimes the part that we want to be flexible |
| isn’t crucial to performance. |
| In these cases, it’s okay to leave some efficiency on the table |
| to support more flexible interfaces. |
| In machine learning, combining methods usually works better than using just one.</p> |
| <p>If you can combine the programming models correctly, |
| you can get better results than when using a single programming model. |
| In this section, we discuss how to do so.</p> |
| <div class="section" id="symbolic-and-imperative-programs"> |
| <span id="symbolic-and-imperative-programs"></span><h3>Symbolic and Imperative Programs<a class="headerlink" href="#symbolic-and-imperative-programs" title="Permalink to this headline">¶</a></h3> |
| <p>There are two ways to mix symbolic and imperative programs:</p> |
| <ul class="simple"> |
| <li>Use imperative programs within symbolic programs as callbacks</li> |
| <li>Use symbolic programs as part of imperative programs</li> |
| </ul> |
| <p>We’ve observed that it’s usually helpful to write parameter updates imperatively, |
| and perform gradient calculations in symbolic programs.</p> |
<p>Symbolic libraries already mix the two styles, because Python itself is imperative.
| For example, the following program mixes the symbolic approach with NumPy, which is imperative.</p> |
| <div class="highlight-python"><div class="highlight"><pre><span></span> <span class="n">A</span> <span class="o">=</span> <span class="n">Variable</span><span class="p">(</span><span class="s1">'A'</span><span class="p">)</span> |
| <span class="n">B</span> <span class="o">=</span> <span class="n">Variable</span><span class="p">(</span><span class="s1">'B'</span><span class="p">)</span> |
| <span class="n">C</span> <span class="o">=</span> <span class="n">B</span> <span class="o">*</span> <span class="n">A</span> |
| <span class="n">D</span> <span class="o">=</span> <span class="n">C</span> <span class="o">+</span> <span class="n">Constant</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> |
| <span class="c1"># compiles the function</span> |
| <span class="n">f</span> <span class="o">=</span> <span class="nb">compile</span><span class="p">(</span><span class="n">D</span><span class="p">)</span> |
| <span class="n">d</span> <span class="o">=</span> <span class="n">f</span><span class="p">(</span><span class="n">A</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span> <span class="n">B</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span><span class="o">*</span><span class="mi">2</span><span class="p">)</span> |
| <span class="n">d</span> <span class="o">=</span> <span class="n">d</span> <span class="o">+</span> <span class="mf">1.0</span> |
| </pre></div> |
| </div> |
| <p>The symbolic graph is compiled into a function that can be executed imperatively. |
| Its internals are a black box to the user. |
| This is exactly like writing C++ programs and exposing them to Python, which is common practice.</p> |
| <p>Because parameter memory resides on the GPU, |
| you might not want to use NumPy as the imperative component. |
| A better option is to support a GPU-compatible imperative library |
| that interacts with the compiled symbolic functions, |
| or to provide a limited amount of update syntax |
| inside the symbolic program’s execution.</p> |
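| <p>To make this concrete, here is a minimal, runnable sketch of the mixed style in NumPy. |
| The compiled symbolic gradient is stood in for by a plain Python closure; |
| in a real symbolic library, <code>grad_fn</code> would be produced by |
| compiling the gradient graph ahead of time. The names here are illustrative, |
| not the API of any particular library.</p> |
| <div class="highlight-python"><div class="highlight"><pre><span></span>import numpy as np |
| |
| def make_grad_fn(): |
|     # stands in for compiling the symbolic gradient of |
|     # loss = sum((w * x - 1) ** 2), whose gradient w.r.t. w |
|     # is 2 * (w * x - 1) * x |
|     def grad_fn(w, x): |
|         return 2.0 * (w * x - 1.0) * x |
|     return grad_fn |
| |
| grad_fn = make_grad_fn()         # "symbolic" part: built once |
| w = np.random.randn(10)          # parameters live imperatively |
| |
| for _ in range(100): |
|     g = grad_fn(w, np.ones(10))  # run the compiled function |
|     w = w - 0.01 * g             # imperative parameter update |
| </pre></div> |
| </div> |
| <p>The same division of labor carries over when the arrays live on the GPU: |
| only the array type changes, not the split between the compiled gradient |
| function and the imperative update loop.</p> |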
| </div> |
| <div class="section" id="small-and-big-operations"> |
| <span id="small-and-big-operations"></span><h3>Small and Big Operations<a class="headerlink" href="#small-and-big-operations" title="Permalink to this headline">¶</a></h3> |
| <p>There might be a good reason to combine small and big operations. |
| Consider applications that perform tasks such as changing |
| a loss function or adding a few customized layers to an existing structure. |
| Usually, you can use big operations to compose existing |
| components, and use smaller operations to build the new parts.</p> |
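| <p>As a minimal NumPy sketch of this division of labor, in the example below |
| <code>np.dot</code> plays the role of a big, pre-optimized operation, while a |
| customized Huber-style loss is assembled from small element-wise operations. |
| The names and shapes are illustrative only.</p> |
| <div class="highlight-python"><div class="highlight"><pre><span></span>import numpy as np |
| |
| def forward(X, W, b): |
|     # big operation: a dense layer backed by an optimized matmul |
|     return np.dot(X, W) + b |
| |
| def huber_loss(pred, target, delta=1.0): |
|     # small operations: compose a customized loss element-wise |
|     err = pred - target |
|     quadratic = 0.5 * err ** 2 |
|     linear = delta * (np.abs(err) - 0.5 * delta) |
|     return np.where(np.abs(err) &lt;= delta, quadratic, linear).mean() |
| |
| X = np.random.randn(32, 64) |
| W = np.random.randn(64, 10) |
| b = np.zeros(10) |
| target = np.random.randn(32, 10) |
| print(huber_loss(forward(X, W, b), target)) |
| </pre></div> |
| </div> |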
| <p>Recall Amdahl’s law. Often, the new components |
| are not the cause of the computation bottleneck. |
| Because the performance-critical part is already covered by |
| the optimized big operations, it’s okay to leave the additional |
| small operations unoptimized, or to apply a limited amount of |
| memory optimization and run them directly, rather than |
| performing full operator fusion.</p> |
| </div> |
| <div class="section" id="choose-your-own-approach"> |
| <span id="choose-your-own-approach"></span><h3>Choose Your Own Approach<a class="headerlink" href="#choose-your-own-approach" title="Permalink to this headline">¶</a></h3> |
| <p>In this document, we compared multiple approaches |
| to developing programming environments for deep learning. |
| We compared the usability and efficiency implications of each, |
| finding that many of these trade-offs (like imperative vs. symbolic) aren’t necessarily black and white. |
| You can choose your approach, or combine the approaches |
| to create more interesting and intelligent deep learning libraries.</p> |
| </div> |
| </div> |
| <div class="section" id="contribute-to-mxnet"> |
| <span id="contribute-to-mxnet"></span><h2>Contribute to MXNet<a class="headerlink" href="#contribute-to-mxnet" title="Permalink to this headline">¶</a></h2> |
| <p>This document is part of our effort to provide <a class="reference internal" href="index.html"><em>open-source system design notes</em></a> |
| for deep learning libraries. If you’re interested in contributing to <em>MXNet</em> or its |
| documentation, <a class="reference external" href="http://github.com/dmlc/mxnet">fork us on GitHub</a>.</p> |
| </div> |
| <div class="section" id="next-steps"> |
| <span id="next-steps"></span><h2>Next Steps<a class="headerlink" href="#next-steps" title="Permalink to this headline">¶</a></h2> |
| <div class="toctree-wrapper compound"> |
| <ul> |
| <li class="toctree-l1"><a class="reference external" href="https://mxnet.incubator.apache.org/architecture/note_engine.html">Dependency Engine for Deep Learning</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://mxnet.incubator.apache.org/architecture/note_memory.html">Squeeze the Memory Consumption of Deep Learning</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://mxnet.incubator.apache.org/architecture/note_data_loading.html">Efficient Data Loading Module for Deep Learning</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://mxnet.incubator.apache.org/architecture/rnn_interface.html">Survey of RNN Interface</a></li> |
| </ul> |
| </div> |
| </div> |
| </div> |
| <div class="container"> |
| <div class="footer"> |
| <p> </p> |
| </div> |
| </div> |
| </div> |
| <div aria-label="main navigation" class="sphinxsidebar rightsidebar" role="navigation"> |
| <div class="sphinxsidebarwrapper"> |
| <h3><a href="../index.html">Table Of Contents</a></h3> |
| <ul> |
| <li><a class="reference internal" href="#">Deep Learning Programming Style</a><ul> |
| <li><a class="reference internal" href="#symbolic-vs-imperative-programs">Symbolic vs. Imperative Programs</a><ul> |
| <li><a class="reference internal" href="#imperative-programs-tend-to-be-more-flexible">Imperative Programs Tend to be More Flexible</a></li> |
| <li><a class="reference internal" href="#symbolic-programs-tend-to-be-more-efficient">Symbolic Programs Tend to be More Efficient</a></li> |
| <li><a class="reference internal" href="#case-study-backprop-and-autodiff">Case Study: Backprop and AutoDiff</a></li> |
| <li><a class="reference internal" href="#model-checkpoint">Model Checkpoint</a></li> |
| <li><a class="reference internal" href="#parameter-updates">Parameter Updates</a></li> |
| <li><a class="reference internal" href="#there-is-no-strict-boundary">There Is No Strict Boundary</a></li> |
| </ul> |
| </li> |
| <li><a class="reference internal" href="#big-vs-small-operations">Big vs. Small Operations</a><ul> |
| <li><a class="reference internal" href="#smaller-operations-can-be-more-flexible">Smaller Operations Can Be More Flexible</a></li> |
| <li><a class="reference internal" href="#big-operations-are-more-efficient">Big Operations Are More Efficient</a></li> |
| <li><a class="reference internal" href="#compilation-and-optimization">Compilation and Optimization</a></li> |
| <li><a class="reference internal" href="#expression-template-and-statically-typed-language">Expression Template and Statically Typed Language</a></li> |
| </ul> |
| </li> |
| <li><a class="reference internal" href="#mix-the-approaches">Mix the Approaches</a><ul> |
| <li><a class="reference internal" href="#symbolic-and-imperative-programs">Symbolic and Imperative Programs</a></li> |
| <li><a class="reference internal" href="#small-and-big-operations">Small and Big Operations</a></li> |
| <li><a class="reference internal" href="#choose-your-own-approach">Choose Your Own Approach</a></li> |
| </ul> |
| </li> |
| <li><a class="reference internal" href="#contribute-to-mxnet">Contribute to MXNet</a></li> |
| <li><a class="reference internal" href="#next-steps">Next Steps</a></li> |
| </ul> |
| </li> |
| </ul> |
| </div> |
| </div> |
| </div> <!-- pagename != index --> |
| <script crossorigin="anonymous" integrity="sha384-0mSbJDEHialfmuBBQP6A4Qrprq5OVfW37PRR3j5ELqxss1yVqOtnepnHVP9aJ7xS" src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js"></script> |
| <script src="../_static/js/sidebar.js" type="text/javascript"></script> |
| <script src="../_static/js/search.js" type="text/javascript"></script> |
| <script src="../_static/js/navbar.js" type="text/javascript"></script> |
| <script src="../_static/js/clipboard.min.js" type="text/javascript"></script> |
| <script src="../_static/js/copycode.js" type="text/javascript"></script> |
| <script type="text/javascript"> |
| $('body').ready(function () { |
| $('body').css('visibility', 'visible'); |
| }); |
| </script> |
| </div></body> |
| </html> |