| <!DOCTYPE html> |
| |
| <html lang="en"> |
| <head> |
| <meta charset="utf-8"/> |
| <meta content="IE=edge" http-equiv="X-UA-Compatible"/> |
| <meta content="width=device-width, initial-scale=1" name="viewport"/> |
<meta content="Optimizing Memory Consumption in Deep Learning" property="og:title"/>
<meta content="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/og-logo.png" property="og:image"/>
<meta content="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/og-logo.png" property="og:image:secure_url"/>
| <meta content="Optimizing Memory Consumption in Deep Learning" property="og:description"/> |
| <title>Optimizing Memory Consumption in Deep Learning — mxnet documentation</title> |
| <link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css" integrity="sha384-1q8mTJOASx8j1Au+a5WDVnPi2lkFfwwEAa8hDDdjZlpLegxhjVME1fgjWPGmkzs7" rel="stylesheet"/> |
| <link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css" rel="stylesheet"/> |
<link href="../_static/basic.css" rel="stylesheet" type="text/css"/>
<link href="../_static/pygments.css" rel="stylesheet" type="text/css"/>
| <link href="../_static/mxnet.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript"> |
| var DOCUMENTATION_OPTIONS = { |
| URL_ROOT: '../', |
| VERSION: '', |
| COLLAPSE_INDEX: false, |
| FILE_SUFFIX: '.html', |
| HAS_SOURCE: true, |
| SOURCELINK_SUFFIX: '.txt' |
| }; |
| </script> |
| <script src="https://code.jquery.com/jquery-1.11.1.min.js" type="text/javascript"></script> |
| <script src="../_static/underscore.js" type="text/javascript"></script> |
| <script src="../_static/searchtools_custom.js" type="text/javascript"></script> |
| <script src="../_static/doctools.js" type="text/javascript"></script> |
| <script src="../_static/selectlang.js" type="text/javascript"></script> |
| <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript"></script> |
| <script type="text/javascript"> jQuery(function() { Search.loadIndex("/versions/1.0.0/searchindex.js"); Search.init();}); </script> |
| <script> |
| (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ |
| (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new |
| Date();a=s.createElement(o), |
| m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) |
| })(window,document,'script','https://www.google-analytics.com/analytics.js','ga'); |
| |
| ga('create', 'UA-96378503-1', 'auto'); |
| ga('send', 'pageview'); |
| |
| </script> |
| <!-- --> |
| <!-- <script type="text/javascript" src="../_static/jquery.js"></script> --> |
| <!-- --> |
| <!-- <script type="text/javascript" src="../_static/underscore.js"></script> --> |
| <!-- --> |
| <!-- <script type="text/javascript" src="../_static/doctools.js"></script> --> |
| <!-- --> |
| <!-- <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script> --> |
| <!-- --> |
<link href="../genindex.html" rel="index" title="Index"/>
| <link href="../search.html" rel="search" title="Search"/> |
| <link href="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/mxnet-icon.png" rel="icon" type="image/png"/> |
</head>
| <body background="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/mxnet-background-compressed.jpeg" role="document"> |
| <div class="content-block"><div class="navbar navbar-fixed-top"> |
| <div class="container" id="navContainer"> |
| <div class="innder" id="header-inner"> |
| <h1 id="logo-wrap"> |
| <a href="../" id="logo"><img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/mxnet_logo.png"/></a> |
| </h1> |
| <nav class="nav-bar" id="main-nav"> |
| <a class="main-nav-link" href="/versions/1.0.0/install/index.html">Install</a> |
| <span id="dropdown-menu-position-anchor"> |
| <a aria-expanded="true" aria-haspopup="true" class="main-nav-link dropdown-toggle" data-toggle="dropdown" href="#" role="button">Gluon <span class="caret"></span></a> |
| <ul class="dropdown-menu navbar-menu" id="package-dropdown-menu"> |
| <li><a class="main-nav-link" href="/versions/1.0.0/tutorials/gluon/gluon.html">About</a></li> |
| <li><a class="main-nav-link" href="https://www.d2l.ai/">Dive into Deep Learning</a></li> |
| <li><a class="main-nav-link" href="https://gluon-cv.mxnet.io">GluonCV Toolkit</a></li> |
| <li><a class="main-nav-link" href="https://gluon-nlp.mxnet.io/">GluonNLP Toolkit</a></li> |
| </ul> |
| </span> |
| <span id="dropdown-menu-position-anchor"> |
| <a aria-expanded="true" aria-haspopup="true" class="main-nav-link dropdown-toggle" data-toggle="dropdown" href="#" role="button">API <span class="caret"></span></a> |
| <ul class="dropdown-menu navbar-menu" id="package-dropdown-menu"> |
| <li><a class="main-nav-link" href="/versions/1.0.0/api/python/index.html">Python</a></li> |
| <li><a class="main-nav-link" href="/versions/1.0.0/api/c++/index.html">C++</a></li> |
| <li><a class="main-nav-link" href="/versions/1.0.0/api/julia/index.html">Julia</a></li> |
| <li><a class="main-nav-link" href="/versions/1.0.0/api/perl/index.html">Perl</a></li> |
| <li><a class="main-nav-link" href="/versions/1.0.0/api/r/index.html">R</a></li> |
| <li><a class="main-nav-link" href="/versions/1.0.0/api/scala/index.html">Scala</a></li> |
| </ul> |
| </span> |
| <span id="dropdown-menu-position-anchor-docs"> |
| <a aria-expanded="true" aria-haspopup="true" class="main-nav-link dropdown-toggle" data-toggle="dropdown" href="#" role="button">Docs <span class="caret"></span></a> |
| <ul class="dropdown-menu navbar-menu" id="package-dropdown-menu-docs"> |
| <li><a class="main-nav-link" href="/versions/1.0.0/faq/index.html">FAQ</a></li> |
<li><a class="main-nav-link" href="/versions/1.0.0/tutorials/index.html">Tutorials</a></li>
<li><a class="main-nav-link" href="https://github.com/apache/incubator-mxnet/tree/v1.0.0/example">Examples</a></li>
<li><a class="main-nav-link" href="/versions/1.0.0/architecture/index.html">Architecture</a></li>
<li><a class="main-nav-link" href="https://cwiki.apache.org/confluence/display/MXNET/Apache+MXNet+Home">Developer Wiki</a></li>
<li><a class="main-nav-link" href="/versions/1.0.0/model_zoo/index.html">Model Zoo</a></li>
<li><a class="main-nav-link" href="https://github.com/onnx/onnx-mxnet">ONNX</a></li>
</ul>
| </span> |
| <span id="dropdown-menu-position-anchor-community"> |
| <a aria-expanded="true" aria-haspopup="true" class="main-nav-link dropdown-toggle" data-toggle="dropdown" href="#" role="button">Community <span class="caret"></span></a> |
| <ul class="dropdown-menu navbar-menu" id="package-dropdown-menu-community"> |
| <li><a class="main-nav-link" href="http://discuss.mxnet.io">Forum</a></li> |
| <li><a class="main-nav-link" href="https://github.com/apache/incubator-mxnet/tree/v1.0.0">Github</a></li> |
| <li><a class="main-nav-link" href="/versions/1.0.0/community/contribute.html">Contribute</a></li> |
| <li><a class="main-nav-link" href="/versions/1.0.0/community/powered_by.html">Powered By</a></li> |
| </ul> |
| </span> |
| <span id="dropdown-menu-position-anchor-version" style="position: relative"><a href="#" class="main-nav-link dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">1.0.0<span class="caret"></span></a><ul id="package-dropdown-menu" class="dropdown-menu"><li><a href="/">master</a></li><li><a href="/versions/1.7.0/">1.7.0</a></li><li><a href=/versions/1.6.0/>1.6.0</a></li><li><a href=/versions/1.5.0/>1.5.0</a></li><li><a href=/versions/1.4.1/>1.4.1</a></li><li><a href=/versions/1.3.1/>1.3.1</a></li><li><a href=/versions/1.2.1/>1.2.1</a></li><li><a href=/versions/1.1.0/>1.1.0</a></li><li><a href=/versions/1.0.0/>1.0.0</a></li><li><a href=/versions/0.12.1/>0.12.1</a></li><li><a href=/versions/0.11.0/>0.11.0</a></li></ul></span></nav> |
| <script> function getRootPath(){ return "../" } </script> |
| <div class="burgerIcon dropdown"> |
| <a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button">☰</a> |
| <ul class="dropdown-menu" id="burgerMenu"> |
| <li><a href="/versions/1.0.0/install/index.html">Install</a></li> |
| <li><a class="main-nav-link" href="/versions/1.0.0/tutorials/index.html">Tutorials</a></li> |
| <li class="dropdown-submenu dropdown"> |
| <a aria-expanded="true" aria-haspopup="true" class="dropdown-toggle burger-link" data-toggle="dropdown" href="#" tabindex="-1">Gluon</a> |
| <ul class="dropdown-menu navbar-menu" id="package-dropdown-menu"> |
| <li><a class="main-nav-link" href="/versions/1.0.0/tutorials/gluon/gluon.html">About</a></li> |
| <li><a class="main-nav-link" href="http://gluon.mxnet.io">The Straight Dope (Tutorials)</a></li> |
| <li><a class="main-nav-link" href="https://gluon-cv.mxnet.io">GluonCV Toolkit</a></li> |
| <li><a class="main-nav-link" href="https://gluon-nlp.mxnet.io/">GluonNLP Toolkit</a></li> |
| </ul> |
| </li> |
| <li class="dropdown-submenu"> |
| <a aria-expanded="true" aria-haspopup="true" class="dropdown-toggle burger-link" data-toggle="dropdown" href="#" tabindex="-1">API</a> |
| <ul class="dropdown-menu"> |
| <li><a class="main-nav-link" href="/versions/1.0.0/api/python/index.html">Python</a></li> |
| <li><a class="main-nav-link" href="/versions/1.0.0/api/c++/index.html">C++</a></li> |
| <li><a class="main-nav-link" href="/versions/1.0.0/api/julia/index.html">Julia</a></li> |
| <li><a class="main-nav-link" href="/versions/1.0.0/api/perl/index.html">Perl</a></li> |
| <li><a class="main-nav-link" href="/versions/1.0.0/api/r/index.html">R</a></li> |
| <li><a class="main-nav-link" href="/versions/1.0.0/api/scala/index.html">Scala</a></li> |
| </ul> |
| </li> |
| <li class="dropdown-submenu"> |
| <a aria-expanded="true" aria-haspopup="true" class="dropdown-toggle burger-link" data-toggle="dropdown" href="#" tabindex="-1">Docs</a> |
| <ul class="dropdown-menu"> |
| <li><a href="/versions/1.0.0/faq/index.html" tabindex="-1">FAQ</a></li> |
| <li><a href="/versions/1.0.0/tutorials/index.html" tabindex="-1">Tutorials</a></li> |
| <li><a href="https://github.com/apache/incubator-mxnet/tree/v1.0.0/example" tabindex="-1">Examples</a></li> |
| <li><a href="/versions/1.0.0/architecture/index.html" tabindex="-1">Architecture</a></li> |
| <li><a href="https://cwiki.apache.org/confluence/display/MXNET/Apache+MXNet+Home" tabindex="-1">Developer Wiki</a></li> |
| <li><a href="/versions/1.0.0/model_zoo/index.html" tabindex="-1">Gluon Model Zoo</a></li> |
| <li><a href="https://github.com/onnx/onnx-mxnet" tabindex="-1">ONNX</a></li> |
| </ul> |
| </li> |
| <li class="dropdown-submenu dropdown"> |
| <a aria-haspopup="true" class="dropdown-toggle burger-link" data-toggle="dropdown" href="#" role="button" tabindex="-1">Community</a> |
| <ul class="dropdown-menu"> |
| <li><a href="http://discuss.mxnet.io" tabindex="-1">Forum</a></li> |
| <li><a href="https://github.com/apache/incubator-mxnet/tree/v1.0.0" tabindex="-1">Github</a></li> |
| <li><a href="/versions/1.0.0/community/contribute.html" tabindex="-1">Contribute</a></li> |
| <li><a href="/versions/1.0.0/community/powered_by.html" tabindex="-1">Powered By</a></li> |
| </ul> |
| </li> |
| <li id="dropdown-menu-position-anchor-version-mobile" class="dropdown-submenu" style="position: relative"><a href="#" tabindex="-1">1.0.0</a><ul class="dropdown-menu"><li><a tabindex="-1" href=/>master</a></li><li><a tabindex="-1" href=/versions/1.6.0/>1.6.0</a></li><li><a tabindex="-1" href=/versions/1.5.0/>1.5.0</a></li><li><a tabindex="-1" href=/versions/1.4.1/>1.4.1</a></li><li><a tabindex="-1" href=/versions/1.3.1/>1.3.1</a></li><li><a tabindex="-1" href=/versions/1.2.1/>1.2.1</a></li><li><a tabindex="-1" href=/versions/1.1.0/>1.1.0</a></li><li><a tabindex="-1" href=/versions/1.0.0/>1.0.0</a></li><li><a tabindex="-1" href=/versions/0.12.1/>0.12.1</a></li><li><a tabindex="-1" href=/versions/0.11.0/>0.11.0</a></li></ul></li></ul> |
| </div> |
| <div class="plusIcon dropdown"> |
| <a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button"><span aria-hidden="true" class="glyphicon glyphicon-plus"></span></a> |
| <ul class="dropdown-menu dropdown-menu-right" id="plusMenu"></ul> |
| </div> |
| <div id="search-input-wrap"> |
| <form action="../search.html" autocomplete="off" class="" method="get" role="search"> |
| <div class="form-group inner-addon left-addon"> |
| <i class="glyphicon glyphicon-search"></i> |
| <input class="form-control" name="q" placeholder="Search" type="text"/> |
| </div> |
<input name="check_keywords" type="hidden" value="yes"/>
<input name="area" type="hidden" value="default"/>
</form>
| <div id="search-preview"></div> |
| </div> |
| <div id="searchIcon"> |
| <span aria-hidden="true" class="glyphicon glyphicon-search"></span> |
| </div> |
| <!-- <div id="lang-select-wrap"> --> |
| <!-- <label id="lang-select-label"> --> |
| <!-- <\!-- <i class="fa fa-globe"></i> -\-> --> |
| <!-- <span></span> --> |
| <!-- </label> --> |
| <!-- <select id="lang-select"> --> |
| <!-- <option value="en">Eng</option> --> |
| <!-- <option value="zh">中文</option> --> |
| <!-- </select> --> |
| <!-- </div> --> |
| <!-- <a id="mobile-nav-toggle"> |
| <span class="mobile-nav-toggle-bar"></span> |
| <span class="mobile-nav-toggle-bar"></span> |
| <span class="mobile-nav-toggle-bar"></span> |
| </a> --> |
| </div> |
| </div> |
| </div> |
| <script type="text/javascript"> |
| $('body').css('background', 'white'); |
| </script> |
| <div class="container"> |
| <div class="row"> |
| <div aria-label="main navigation" class="sphinxsidebar leftsidebar" role="navigation"> |
| <div class="sphinxsidebarwrapper"> |
| <ul> |
| <li class="toctree-l1"><a class="reference internal" href="../api/python/index.html">Python Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/r/index.html">R Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/julia/index.html">Julia Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/c++/index.html">C++ Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/scala/index.html">Scala Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/perl/index.html">Perl Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../faq/index.html">HowTo Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="index.html">System Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../tutorials/index.html">Tutorials</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../community/index.html">Community</a></li> |
| </ul> |
| </div> |
| </div> |
| <div class="content"> |
| <div class="page-tracker"></div> |
| <div class="section" id="optimizing-memory-consumption-in-deep-learning"> |
| <span id="optimizing-memory-consumption-in-deep-learning"></span><h1>Optimizing Memory Consumption in Deep Learning<a class="headerlink" href="#optimizing-memory-consumption-in-deep-learning" title="Permalink to this headline">¶</a></h1> |
<p>Over the last ten years, the trend in deep learning
has been towards deeper and larger networks.
Despite rapid advances in hardware performance,
cutting-edge deep learning models continue to push the limits of GPU RAM.
So even today, it remains desirable to find ways
to train larger models while consuming less memory.
Doing so enables us to train faster with larger batch sizes,
and consequently to achieve higher GPU utilization.</p>
| <p>In this document, we explore techniques for optimizing |
| memory allocation for deep neural networks. |
| We discuss a few candidate solutions. |
| While our proposals are by no means exhaustive, |
| these solutions are instructive and allow us to |
| introduce the major design issues at play.</p> |
| <div class="section" id="computation-graph"> |
| <span id="computation-graph"></span><h2>Computation Graph<a class="headerlink" href="#computation-graph" title="Permalink to this headline">¶</a></h2> |
| <p>First, let’s revisit the idea of the computation graph. |
| A computation graph describes the (data flow) dependencies |
| between the operations in the deep network. |
| The operations performed in the graph |
| can be either fine-grained or coarse-grained. |
| The following figure shows two examples of computation graphs.</p> |
| <p><img alt="Comp Graph Example" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/comp_graph_example.png"/></p> |
| <p>The concept of a computation graph is explicitly encoded in packages like Theano and CGT. |
| In other libraries, computation graphs appear implicitly as network configuration files. |
| The major difference in these libraries comes down to how they calculate gradients. |
| There are mainly two ways: performing back-propagation on the <em>same</em> graph |
| or explicitly representing a <em>backwards path</em> to calculate the required gradients.</p> |
| <p><img alt="Backward Graph" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/back_graph.png"/></p> |
| <p>Libraries like Caffe, CXXNet, and Torch take the former approach, |
| performing back-prop on the original graph. |
| Libraries like Theano and CGT take the latter approach, |
| explicitly representing the backward path. |
| In this discussion, we adopt the <em>explicit backward path</em> approach |
| because it has several advantages for optimization.</p> |
<p>However, we should emphasize that choosing the explicit backward path approach doesn’t restrict us
to symbolic libraries, such as Theano and CGT. We can also use the explicit backward path for gradient calculation in
layer-based libraries (which tie forward and backward together). The following graph shows how:
we introduce a backward node that links to the forward node of the graph and calls <code class="docutils literal"><span class="pre">layer.backward</span></code>
in the backward operations.</p>
| <p><img alt="Backward Layer" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/explicit_back_layer.png"/></p> |
| <p>This discussion applies to almost all existing deep learning libraries. |
| (There are differences between libraries, e.g., higher-order differentiation, which is beyond the scope of this topic.)</p> |
<p>Why is the explicit backward path better? Let’s look at two reasons.
First, the explicit backward path
clearly describes the dependencies between computations.
Consider the following case, where we want
the gradients of A and B. As we can see clearly from the graph,
the computation of the <code class="docutils literal"><span class="pre">d(C)</span></code> gradient doesn’t depend on F.
This means that we can free the memory of <code class="docutils literal"><span class="pre">F</span></code>
as soon as the forward computation is done.
Similarly, the memory of <code class="docutils literal"><span class="pre">C</span></code> can be recycled.</p>
| <p><img alt="Backward Prune" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/back_dep_prune.png"/></p> |
| <p>Another advantage of the explicit backward path |
| is the ability to have a different backward path, |
instead of a mirror of the forward one.
| A common example is the split connection case, |
| as shown in the following figure.</p> |
| <p><img alt="Backward Agg" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/back_agg_grad.png"/></p> |
| <p>In this example, the output of B is referenced by two operations. |
| If we want to do the gradient calculation in the same |
| network, we need to introduce an explicit split layer. |
| This means we need to do the split for the forward pass, too. |
| In this figure, the forward pass doesn’t contain a split layer, |
| but the graph will automatically insert a gradient |
| aggregation node before passing the gradient back to B. |
| This helps us to save the memory cost of allocating the output of the split layer, |
| and the operation cost of replicating the data in the forward pass.</p> |
| <p>If we adopt the explicit backward approach, |
| there’s no difference between the forward pass and the backward pass. |
| We simply step through the computation graph in chronological order |
| and carry out computations. |
| This makes the explicit backward approach easy to analyze. |
| We just need to answer the question: |
| how do we allocate memory for each output node of a computation graph?</p> |
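<p>The explicit backward construction can be sketched in a few lines of Python. This is only an illustration of the idea, not MXNet’s actual API: the <code class="docutils literal"><span class="pre">Node</span></code> class and all names are ours. For each forward node we append one gradient node to the same graph, so the backward pass is simply more nodes executed in order:</p>

```python
class Node:
    def __init__(self, name, inputs=()):
        self.name = name
        self.inputs = list(inputs)   # data-flow dependencies

def add_backward_nodes(forward_nodes):
    """Extend the graph with one gradient node per forward node.

    The gradient of a node depends on the gradients of its consumers,
    so we walk the forward list in reverse order."""
    consumers = {n.name: [] for n in forward_nodes}
    for n in forward_nodes:
        for i in n.inputs:
            consumers[i.name].append(n)
    grad_nodes = {}
    full_graph = list(forward_nodes)
    for n in reversed(forward_nodes):
        deps = [grad_nodes[c.name] for c in consumers[n.name]]
        g = Node("d(%s)" % n.name, inputs=deps + [n])
        grad_nodes[n.name] = g
        full_graph.append(g)
    return full_graph

# A simple chain A -> B -> C gains the nodes d(C), d(B), d(A):
A = Node("A"); B = Node("B", [A]); C = Node("C", [B])
graph = [A, B, C]
print([n.name for n in add_backward_nodes(graph)])
```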
| </div> |
| <div class="section" id="what-can-be-optimized"> |
| <span id="what-can-be-optimized"></span><h2>What Can Be Optimized?<a class="headerlink" href="#what-can-be-optimized" title="Permalink to this headline">¶</a></h2> |
| <p>As you can see, the computation graph is a useful way |
| to discuss memory allocation optimization techniques. |
| Already, we’ve shown how you can save some memory |
| by using the explicit backward graph. |
| Now let’s explore further optimizations, |
| and see how we might determine reasonable baselines for benchmarking.</p> |
| <p>Assume that we want to build a neural network with <code class="docutils literal"><span class="pre">n</span></code> layers. |
| Typically, when implementing a neural network, |
| we need to allocate node space for both the output of each layer |
| and the gradient values used during back-propagation. |
| This means we need roughly <code class="docutils literal"><span class="pre">2</span> <span class="pre">n</span></code> memory cells. |
| We face the same requirement when using the explicit backward graph approach |
| because the number of nodes in a backward pass |
| is roughly the same as in a forward pass.</p> |
| <div class="section" id="in-place-operations"> |
| <span id="in-place-operations"></span><h3>In-place Operations<a class="headerlink" href="#in-place-operations" title="Permalink to this headline">¶</a></h3> |
| <p>One of the simplest techniques we can employ |
| is <em>in-place memory sharing</em> across operations. |
| For neural networks, we can usually apply this technique |
| for the operations corresponding to activation functions. |
| Consider the following case, where we want |
| to compute the value of three chained sigmoid functions.</p> |
| <p><img alt="Inplace op" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_inline.png"/></p> |
| <p>Because we can compute sigmoid <code class="docutils literal"><span class="pre">in-place</span></code>, |
| using the same memory for input and output, |
| we can compute an arbitrary-length chain |
| of sigmoid functions using constant memory.</p> |
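<p>As a concrete, if simplified, illustration: the in-place sigmoid below overwrites its input buffer, so a chain of any length touches only one buffer. The helper name is ours, not a library API:</p>

```python
import math

def sigmoid_(buf):
    """Sigmoid applied in place: the result overwrites the input
    buffer, so each step allocates no new memory."""
    for i in range(len(buf)):
        buf[i] = 1.0 / (1.0 + math.exp(-buf[i]))
    return buf

# A chain of three sigmoids reuses one buffer instead of four:
buf = [0.0, 1.0, -1.0]
for _ in range(3):
    sigmoid_(buf)          # same list object every step
```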
| <p>Note: it’s easy to make mistakes when implementing in-place optimization. |
| Consider the following case, where the value of B is used not only by C, but also by F.</p> |
| <p><img alt="In-place trap" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_inline_trap.png"/></p> |
| <p>We can’t perform in-place optimization because the value of B |
| is still needed after <code class="docutils literal"><span class="pre">C=sigmoid(B)</span></code> is computed. |
An algorithm that blindly performs in-place optimization
for every sigmoid operation would fall into such a trap,
so we need to be careful about when it is safe to apply.</p>
| </div> |
| <div class="section" id="standard-memory-sharing"> |
| <span id="standard-memory-sharing"></span><h3>Standard Memory Sharing<a class="headerlink" href="#standard-memory-sharing" title="Permalink to this headline">¶</a></h3> |
| <p>In-place operations are not the only places where we can share memory. |
| In the following example, because the value of B is no longer needed |
| after we compute E, we can reuse B’s memory to hold the result of E.</p> |
| <p><img alt="Normal Sharing" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_normal.png"/></p> |
| <p><em>Memory sharing doesn’t necessarily require the same data shape</em>. |
| Note that in the preceding example, the shapes of <code class="docutils literal"><span class="pre">B</span></code> and <code class="docutils literal"><span class="pre">E</span></code> can differ. |
| To handle such a situation, we can allocate a memory region |
| of size equal to the maximum of that required by <code class="docutils literal"><span class="pre">B</span></code> and <code class="docutils literal"><span class="pre">E</span></code> and share it between them.</p> |
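<p>A minimal sketch of this idea uses Python’s <code class="docutils literal"><span class="pre">memoryview</span></code> over one raw byte region; the sizes and variable names are illustrative:</p>

```python
# B needs 1024 float32 values and E needs 4096; B is dead before E is
# written, so one region of max(1024, 4096) floats serves both.
size_B, size_E = 1024, 4096
region = bytearray(4 * max(size_B, size_E))     # one shared raw block

B = memoryview(region)[: 4 * size_B].cast("f")  # B's view during its lifetime
B[0] = 3.0                                      # forward computation writes B

# ... B's last reader has run; its lifetime is over ...
E = memoryview(region)[: 4 * size_E].cast("f")  # E reuses the same bytes
E[0] = 7.0                                      # overwrites B's old storage
```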
| </div> |
| <div class="section" id="example-of-real-neural-network-allocation"> |
| <span id="example-of-real-neural-network-allocation"></span><h3>Example of Real Neural Network Allocation<a class="headerlink" href="#example-of-real-neural-network-allocation" title="Permalink to this headline">¶</a></h3> |
| <p>Of course, these are only toy examples and they address only the computation of the forward pass. |
| But the same ideas apply to real neural networks. |
| The following figure shows an allocation plan for a two-layer perceptron.</p> |
| <p><img alt="Net Alloc" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_mlp.png"/></p> |
| <p>In this example:</p> |
| <ul class="simple"> |
| <li>In-place optimization is applied when computing <code class="docutils literal"><span class="pre">act1</span></code>, <code class="docutils literal"><span class="pre">d(fc1)</span></code>, <code class="docutils literal"><span class="pre">out</span></code> and <code class="docutils literal"><span class="pre">d(fc2)</span></code>.</li> |
| <li>Memory is shared between <code class="docutils literal"><span class="pre">d(act1)</span></code> and <code class="docutils literal"><span class="pre">d(A)</span></code>.</li> |
| </ul> |
| </div> |
| </div> |
| <div class="section" id="memory-allocation-algorithm"> |
| <span id="memory-allocation-algorithm"></span><h2>Memory Allocation Algorithm<a class="headerlink" href="#memory-allocation-algorithm" title="Permalink to this headline">¶</a></h2> |
| <p>So far, we’ve discussed general techniques for optimizing memory allocation. |
| We’ve seen that there are traps to avoid, |
| as demonstrated in the case of in-place memory optimization. |
| So, how can we allocate memory correctly? |
| This is not a new problem. |
For example, it is very similar
to the problem of register allocation in compilers.
| There might be techniques that we can borrow. |
| We’re not attempting to give a comprehensive review of techniques here, |
| but rather to introduce some simple |
| but useful tricks to attack the problem.</p> |
| <p>The key problem is that we need to place resources |
| so that they don’t conflict with each other. |
More specifically, each variable has a <em>lifetime</em>
spanning from the time it is computed until the last time it is used.
In the case of the multi-layer perceptron,
the <em>lifetime</em> of <code class="docutils literal"><span class="pre">fc1</span></code> ends after <code class="docutils literal"><span class="pre">act1</span></code> is computed.</p>
| <p><img alt="Net Alloc" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_mlp.png"/></p> |
| <p>The principle is <em>to allow memory sharing only between variables whose lifetimes don’t overlap</em>. |
| There are multiple ways to do this. |
You can construct a conflict graph
with each variable as a node, add an edge
between any two variables whose lifetimes overlap,
and then run a graph-coloring algorithm.
This likely has <span class="math">\(O(n^2)\)</span> complexity,
where <code class="docutils literal"><span class="pre">n</span></code> is the number of nodes in the graph,
which might be too costly.</p>
<p>Let’s consider a simpler heuristic instead.
The idea is to simulate the procedure of traversing the graph,
keeping for each node a count of the future operations that still depend on it.</p>
| <p><img alt="Alloc" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_step.png"/></p> |
| <ul class="simple"> |
| <li>An in-place optimization can be performed when only the current operation depends on the source (i.e., <code class="docutils literal"><span class="pre">count==1</span></code>).</li> |
| <li>Memory can be recycled into the box on the upper right corner when the <code class="docutils literal"><span class="pre">count</span></code> goes to 0.</li> |
| <li>When we need new memory, we can either get it from the box or allocate a new one.</li> |
| </ul> |
| <p><strong><em>Note:</em></strong> During the simulation, no memory is allocated. |
| Instead, we keep a record of how much memory each node needs, |
| and allocate the maximum of the shared parts in the final memory plan.</p> |
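<p>The counting heuristic above can be sketched as a simulation that hands out abstract storage ids rather than real memory, matching the note that nothing is allocated during the pass. The data structures here are our own illustration:</p>

```python
def plan_memory(order, consumers):
    """Simulate execution to build a static allocation plan.

    order:     node names in execution order
    consumers: node -> list of later nodes that read its output
    Returns node -> storage id; equal ids mean shared memory."""
    count = {n: len(consumers[n]) for n in order}   # pending readers
    free, storage, next_id = [], {}, 0
    for n in order:
        if free:
            storage[n] = free.pop()    # recycle from the "box"
        else:
            storage[n] = next_id       # or plan a fresh cell
            next_id += 1
        # n has now consumed its inputs: decrement their counts
        for src in order:
            if n in consumers[src]:
                count[src] -= 1
                if count[src] == 0:    # last reader done: recycle
                    free.append(storage[src])
    return storage

# A 4-node chain A -> B -> C -> D needs only two storage cells:
plan = plan_memory(["A", "B", "C", "D"],
                   {"A": ["B"], "B": ["C"], "C": ["D"], "D": []})
print(plan)
```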
| </div> |
| <div class="section" id="static-vs-dynamic-allocation"> |
| <span id="static-vs-dynamic-allocation"></span><h2>Static vs. Dynamic Allocation<a class="headerlink" href="#static-vs-dynamic-allocation" title="Permalink to this headline">¶</a></h2> |
| <p>The preceding strategy exactly simulates |
| the dynamic memory allocation procedure |
| in imperative languages, such as Python. |
| The <code class="docutils literal"><span class="pre">count</span></code> is the reference counter for each memory object, |
| and the object gets garbage collected |
| when the reference counter goes to 0. |
| In that sense, |
| we are simulating dynamic memory allocation once |
| to create a static allocation plan. |
| Can we simply use an imperative language |
| that dynamically allocates and deallocates memory?</p> |
| <p>The major difference is that static allocation is only done once, |
| so we can afford to use more complicated algorithms. |
For example, we can search for memory sizes
that are similar to the required memory block.
The allocation can also be made graph-aware,
as we’ll discuss in the next section.
Dynamic allocation, by contrast, puts more pressure
on fast memory allocation and garbage collection.</p>
| <p>There is also one takeaway for users |
| who want to rely on dynamic memory allocations: |
| <em>do not unnecessarily reference objects</em>. |
For example, if we organize all of the nodes in a list
and store them in a Net object,
the reference counts of those nodes never drop to zero, so no memory is ever reclaimed.
Unfortunately, this is a common way to organize code.</p>
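<p>The pitfall can be demonstrated with CPython’s own reference counting; <code class="docutils literal"><span class="pre">Tensor</span></code> here is a stand-in class, not MXNet’s:</p>

```python
import gc
import weakref

class Tensor:
    pass

# Anti-pattern: a container that keeps every intermediate alive.
net_nodes = []
t = Tensor()
net_nodes.append(t)          # extra reference stored "for convenience"
probe = weakref.ref(t)
del t
gc.collect()
print(probe() is None)       # False: the list still pins the object

net_nodes.clear()            # drop the stray reference ...
gc.collect()
print(probe() is None)       # True: now it can be reclaimed
```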
| </div> |
| <div class="section" id="memory-allocation-for-parallel-operations"> |
| <span id="memory-allocation-for-parallel-operations"></span><h2>Memory Allocation for Parallel Operations<a class="headerlink" href="#memory-allocation-for-parallel-operations" title="Permalink to this headline">¶</a></h2> |
| <p>In the previous section, we discussed |
| how we can <em>simulate</em> running the procedure |
| for a computation graph to get a static allocation plan. |
However, optimizing for parallel computation presents other challenges,
because resource sharing and parallelization sit at opposite ends of a trade-off.
| Let’s look at the following two allocation plans for the same graph:</p> |
| <p><img alt="Parallel Alloc" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/parallel_alloc.png"/></p> |
| <p>Both allocation plans are valid |
| if we run the computation serially, |
| from <code class="docutils literal"><span class="pre">A[1]</span></code> to <code class="docutils literal"><span class="pre">A[8]</span></code>. |
However, the allocation plan on the left
introduces an additional dependency,
which means we can’t run the computations of <code class="docutils literal"><span class="pre">A[2]</span></code> and <code class="docutils literal"><span class="pre">A[5]</span></code> in parallel;
the plan on the right can.
To parallelize computation, we need to take greater care.</p>
| <div class="section" id="be-correct-and-safe-first"> |
| <span id="be-correct-and-safe-first"></span><h3>Be Correct and Safe First<a class="headerlink" href="#be-correct-and-safe-first" title="Permalink to this headline">¶</a></h3> |
| <p>Being correct is our first principle. |
| This means executing in a way that takes |
| the implicit dependencies introduced by memory sharing into consideration. |
| You can do this by adding the implicit dependency edges to the execution graph. |
| Or, even simpler, if the execution engine is mutation aware, |
| as described in <a class="reference external" href="/versions/1.0.0/architecture/note_engine.html">our discussion of dependency engine design</a>, |
| push the operations in sequence, |
| writing to the same variable tag |
| to represent the same memory region.</p> |
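Here is a hypothetical sketch of the mutation-aware idea (a toy model, not the actual MXNet engine API): each pushed operation declares the variable tags it reads and writes, and two operations conflict, so they must run in push order, whenever one writes a tag the other touches.

```python
class Engine:
    """Toy mutation-aware scheduler: operations that touch the same
    write tag are serialized in push order."""

    def __init__(self):
        self.ops = []

    def push(self, name, reads, writes):
        self.ops.append((name, set(reads), set(writes)))

    def can_run_in_parallel(self, i, j):
        # A write by one op conflicts with any read or write of the
        # same tag by the other op.
        _, r1, w1 = self.ops[i]
        _, r2, w2 = self.ops[j]
        return not (w1 & (r2 | w2) or w2 & r1)
```

Pushing two operations that write the same variable tag (i.e. the same memory region) therefore forces them to execute in order, which is the correctness condition described above.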
| <p>Always produce a safe memory allocation plan. |
| This means never allocating the same memory |
| to nodes that can run in parallel. |
| This might not be ideal when memory reduction is more desirable, |
| and we don’t gain too much from the extra parallelism |
| when multiple computing streams can already execute simultaneously on the same GPU.</p> |
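Safety can be checked mechanically. The sketch below (hypothetical names; nodes are numbered in topological order) verifies that any two nodes assigned the same buffer are ordered by reachability, meaning they can never run in parallel.

```python
def is_safe_plan(succ, assign):
    """succ[u]: successors of node u in a DAG, nodes in topo order.
    assign[u]: buffer id given to node u by the allocation plan."""
    n = len(assign)
    # reach[u] = all nodes reachable from u, built in reverse topo order.
    reach = [set() for _ in range(n)]
    for u in reversed(range(n)):
        reach[u].add(u)
        for v in succ[u]:
            reach[u] |= reach[v]
    for u in range(n):
        for v in range(u + 1, n):
            shares = assign[u] == assign[v]
            ordered = v in reach[u] or u in reach[v]
            if shares and not ordered:
                return False    # parallelizable nodes share memory
    return True
```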
| </div> |
| <div class="section" id="try-to-allow-more-parallelization"> |
| <span id="try-to-allow-more-parallelization"></span><h3>Try to Allow More Parallelization<a class="headerlink" href="#try-to-allow-more-parallelization" title="Permalink to this headline">¶</a></h3> |
| <p>Now we can safely perform some optimizations. |
| The general idea is to encourage memory sharing between nodes that can’t be parallelized. |
| You can do this by creating an ancestor-relationship |
| graph and querying it during allocation, |
| though this graph costs approximately <span class="math">\(O(n^2)\)</span> time to construct. |
| We can also use a heuristic here, |
| for example, coloring paths in the graph. |
| As shown in the following figure, |
| find the longest path in the graph, |
| color its nodes the same color, and continue with the next-longest path among the remaining nodes.</p> |
| <p><img alt="Path Color" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/graph_color.png"/></p> |
| <p>After you get the color of the node, |
| you allow sharing (or encourage sharing) |
| only between nodes of the same color. |
| This is a stricter version of the ancestor relationship, |
| but it costs only <span class="math">\(O(n)\)</span> time |
| if you search for only the first <code class="docutils literal"><span class="pre">k</span></code> paths.</p> |
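Here is a hypothetical sketch of the path-coloring heuristic: repeatedly find the longest path among still-uncolored nodes of a DAG (nodes numbered in topological order), give that path one color, and stop after <code class="docutils literal"><span class="pre">k</span></code> colors; sharing is then encouraged only within a color.

```python
def color_paths(succ, num_nodes, k):
    """succ[u]: successors of node u; node ids are in topo order.
    Returns a color per node; -1 means left uncolored."""
    color = [-1] * num_nodes
    for c in range(k):
        # Longest path over uncolored nodes, by DP in reverse topo order.
        best_len = [0] * num_nodes
        best_next = [-1] * num_nodes
        for u in reversed(range(num_nodes)):
            if color[u] != -1:
                continue
            best_len[u] = 1
            for v in succ[u]:
                if color[v] == -1 and 1 + best_len[v] > best_len[u]:
                    best_len[u] = 1 + best_len[v]
                    best_next[u] = v
        uncolored = [u for u in range(num_nodes) if color[u] == -1]
        if not uncolored:
            break
        # Walk the longest path and give it color c.
        u = max(uncolored, key=lambda u: best_len[u])
        while u != -1:
            color[u] = c
            u = best_next[u]
    return color
```

Nodes still carrying -1 after the k passes simply don't participate in sharing, which is what keeps the cost linear.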
| <p>This is by no means the only solution. |
| More sophisticated approaches might exist.</p> |
| </div> |
| </div> |
| <div class="section" id="how-much-can-you-save"> |
| <span id="how-much-can-you-save"></span><h2>How Much Can You Save?<a class="headerlink" href="#how-much-can-you-save" title="Permalink to this headline">¶</a></h2> |
| <p>We’ve discussed the techniques and algorithms you can use |
| to squeeze memory usage for deep learning. |
| How much can you really save by using these techniques?</p> |
| <p>On coarse-grained operation graphs |
| that are already optimized for big operations, |
| you can reduce memory consumption roughly <em>by half</em>. |
| You can reduce memory usage even more |
| if you are optimizing a fine-grained computation network |
| used by symbolic libraries, such as Theano.</p> |
| <p>Most of the ideas in this article inspired the design of <em>MXNet</em>. |
| We’ve also provided a <a class="reference external" href="https://github.com/dmlc/mxnet/tree/master/example/memcost">Memory Cost Estimation Script</a>, |
| which you can use to see how much memory you need under different scenarios.</p> |
| <p>The script has an option called <code class="docutils literal"><span class="pre">forward_only</span></code>, |
| which shows the cost of running only the forward pass. |
| You will find that the cost when using this option |
| is extremely low compared to the others. |
| This is simply because there’s more memory reuse |
| if you run only the forward pass.</p> |
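The intuition can be captured in a few lines (a simplified model, not the estimation script itself): in a feed-forward chain, forward-only execution can ping-pong between two buffers, while training must keep every activation for the backward pass.

```python
def chain_buffers(n_layers, training):
    """Buffers needed for an n-layer feed-forward chain."""
    if training:
        # Every activation is kept until the backward pass consumes it.
        return n_layers
    # Forward only: a layer's input can be recycled as soon as its
    # output exists, so two buffers suffice at any depth.
    return min(n_layers, 2)
```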
| <p>So here are two takeaways:</p> |
| <ul class="simple"> |
| <li>Use a computation graph to allocate memory.</li> |
| <li>For deep learning models, prediction consumes much less memory than training.</li> |
| </ul> |
| </div> |
| <div class="section" id="next-steps"> |
| <span id="next-steps"></span><h2>Next Steps<a class="headerlink" href="#next-steps" title="Permalink to this headline">¶</a></h2> |
| <div class="toctree-wrapper compound"> |
| <ul> |
| <li class="toctree-l1"><a class="reference external" href="/versions/1.0.0/architecture/note_data_loading.html">Efficient Data Loading Module for Deep Learning</a></li> |
| <li class="toctree-l1"><a class="reference external" href="/versions/1.0.0/architecture/rnn_interface.html">Survey of RNN Interface</a></li> |
| </ul> |
| </div> |
| </div> |
| </div> |
| </div> |
| </div> |
| <div aria-label="main navigation" class="sphinxsidebar rightsidebar" role="navigation"> |
| <div class="sphinxsidebarwrapper"> |
| <h3><a href="../index.html">Table Of Contents</a></h3> |
| <ul> |
| <li><a class="reference internal" href="#">Optimizing Memory Consumption in Deep Learning</a><ul> |
| <li><a class="reference internal" href="#computation-graph">Computation Graph</a></li> |
| <li><a class="reference internal" href="#what-can-be-optimized">What Can Be Optimized?</a><ul> |
| <li><a class="reference internal" href="#in-place-operations">In-place Operations</a></li> |
| <li><a class="reference internal" href="#standard-memory-sharing">Standard Memory Sharing</a></li> |
| <li><a class="reference internal" href="#example-of-real-neural-network-allocation">Example of Real Neural Network Allocation</a></li> |
| </ul> |
| </li> |
| <li><a class="reference internal" href="#memory-allocation-algorithm">Memory Allocation Algorithm</a></li> |
| <li><a class="reference internal" href="#static-vs-dynamic-allocation">Static vs. Dynamic Allocation</a></li> |
| <li><a class="reference internal" href="#memory-allocation-for-parallel-operations">Memory Allocation for Parallel Operations</a><ul> |
| <li><a class="reference internal" href="#be-correct-and-safe-first">Be Correct and Safe First</a></li> |
| <li><a class="reference internal" href="#try-to-allow-more-parallelization">Try to Allow More Parallelization</a></li> |
| </ul> |
| </li> |
| <li><a class="reference internal" href="#how-much-can-you-save">How Much Can You Save?</a></li> |
| <li><a class="reference internal" href="#next-steps">Next Steps</a></li> |
| </ul> |
| </li> |
| </ul> |
| </div> |
| </div> |
| </div><div class="footer"> |
| <div class="section-disclaimer"> |
| <div class="container"> |
| <div> |
| <img height="60" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/apache_incubator_logo.png"/> |
| <p> |
| Apache MXNet is an effort undergoing incubation at The Apache Software Foundation (ASF), <strong>sponsored by the <i>Apache Incubator</i></strong>. Incubation is required of all newly accepted projects until a further review indicates that the infrastructure, communications, and decision making process have stabilized in a manner consistent with other successful ASF projects. While incubation status is not necessarily a reflection of the completeness or stability of the code, it does indicate that the project has yet to be fully endorsed by the ASF. |
| </p> |
| <p> |
| Copyright © 2017-2018, The Apache Software Foundation. |
| Apache MXNet, MXNet, Apache, the Apache feather, and the Apache MXNet project logo are either registered trademarks or trademarks of the Apache Software Foundation. |
| </p> |
| </div> |
| </div> |
| </div> |
| </div> <!-- pagename != index --> |
| </div> |
| <script crossorigin="anonymous" integrity="sha384-0mSbJDEHialfmuBBQP6A4Qrprq5OVfW37PRR3j5ELqxss1yVqOtnepnHVP9aJ7xS" src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js"></script> |
| <script src="../_static/js/sidebar.js" type="text/javascript"></script> |
| <script src="../_static/js/search.js" type="text/javascript"></script> |
| <script src="../_static/js/navbar.js" type="text/javascript"></script> |
| <script src="../_static/js/clipboard.min.js" type="text/javascript"></script> |
| <script src="../_static/js/copycode.js" type="text/javascript"></script> |
| <script src="../_static/js/page.js" type="text/javascript"></script> |
| <script src="../_static/js/docversion.js" type="text/javascript"></script> |
| <script type="text/javascript"> |
| $('body').ready(function () { |
| $('body').css('visibility', 'visible'); |
| }); |
| </script> |
| </body> |
| </html> |