| <!DOCTYPE html> |
| |
| <html lang="en"> |
| <head> |
| <meta charset="utf-8"/> |
| <meta content="IE=edge" http-equiv="X-UA-Compatible"/> |
| <meta content="width=device-width, initial-scale=1" name="viewport"/> |
| <title>Optimizing Memory Consumption in Deep Learning — mxnet documentation</title> |
| <link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css" integrity="sha384-1q8mTJOASx8j1Au+a5WDVnPi2lkFfwwEAa8hDDdjZlpLegxhjVME1fgjWPGmkzs7" rel="stylesheet"/> |
| <link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css" rel="stylesheet"/> |
| <link href="../_static/basic.css" rel="stylesheet" type="text/css"> |
| <link href="../_static/pygments.css" rel="stylesheet" type="text/css"> |
| <link href="../_static/mxnet.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript"> |
| // Sphinx runtime configuration; consumed by the helper scripts loaded |
| // below (doctools.js, searchtools_custom.js). Values are generated — |
| // do not edit by hand. |
| var DOCUMENTATION_OPTIONS = { |
| URL_ROOT: '../', // relative path from this page back to the docs root |
| VERSION: '', |
| COLLAPSE_INDEX: false, |
| FILE_SUFFIX: '.html', |
| HAS_SOURCE: true, |
| SOURCELINK_SUFFIX: '' |
| }; |
| </script> |
| <script src="../_static/jquery-1.11.1.js" type="text/javascript"></script> |
| <script src="../_static/underscore.js" type="text/javascript"></script> |
| <script src="../_static/searchtools_custom.js" type="text/javascript"></script> |
| <script src="../_static/doctools.js" type="text/javascript"></script> |
| <script src="../_static/selectlang.js" type="text/javascript"></script> |
| <script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript"></script> |
| <script type="text/javascript"> jQuery(function() { Search.loadIndex("/searchindex.js"); Search.init();}); </script> |
| <script> |
| // Standard Google Analytics (analytics.js) bootstrap snippet — minified |
| // vendor code, kept byte-for-byte as shipped. |
| (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ |
| (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new |
| Date();a=s.createElement(o), |
| m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) |
| })(window,document,'script','https://www.google-analytics.com/analytics.js','ga'); |
| |
| ga('create', 'UA-96378503-1', 'auto'); // site tracking property ID |
| ga('send', 'pageview'); |
| |
| </script> |
| <!-- --> |
| <!-- <script type="text/javascript" src="../_static/jquery.js"></script> --> |
| <!-- --> |
| <!-- <script type="text/javascript" src="../_static/underscore.js"></script> --> |
| <!-- --> |
| <!-- <script type="text/javascript" src="../_static/doctools.js"></script> --> |
| <!-- --> |
| <!-- <script type="text/javascript" src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script> --> |
| <!-- --> |
| <link href="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/mxnet-icon.png" rel="icon" type="image/png"/> |
| </head> |
| <body role="document"><!-- Previous Navbar Layout |
| <div class="navbar navbar-default navbar-fixed-top"> |
| <div class="container"> |
| <div class="navbar-header"> |
| <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false" aria-controls="navbar"> |
| <span class="sr-only">Toggle navigation</span> |
| <span class="icon-bar"></span> |
| <span class="icon-bar"></span> |
| <span class="icon-bar"></span> |
| </button> |
| <a href="../" class="navbar-brand"> |
| <img src="http://data.mxnet.io/theme/mxnet.png"> |
| </a> |
| </div> |
| <div id="navbar" class="navbar-collapse collapse"> |
| <ul id="navbar" class="navbar navbar-left"> |
| |
| <li> <a href="../get_started/index.html">Get Started</a> </li> |
| |
| <li> <a href="../tutorials/index.html">Tutorials</a> </li> |
| |
| <li> <a href="../how_to/index.html">How To</a> </li> |
| |
| |
| <li class="dropdown"> |
| <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">Packages <span class="caret"></span></a> |
| <ul class="dropdown-menu"> |
| |
| <li><a href="../packages/python/index.html"> |
| Python |
| </a></li> |
| |
| <li><a href="../packages/r/index.html"> |
| R |
| </a></li> |
| |
| <li><a href="../packages/julia/index.html"> |
| Julia |
| </a></li> |
| |
| <li><a href="../packages/c++/index.html"> |
| C++ |
| </a></li> |
| |
| <li><a href="../packages/scala/index.html"> |
| Scala |
| </a></li> |
| |
| <li><a href="../packages/perl/index.html"> |
| Perl |
| </a></li> |
| |
| </ul> |
| </li> |
| |
| <li> <a href="../system/index.html">System</a> </li> |
| <li> |
| <form class="" role="search" action="../search.html" method="get" autocomplete="off"> |
| <div class="form-group inner-addon left-addon"> |
| <i class="glyphicon glyphicon-search"></i> |
| <input type="text" name="q" class="form-control" placeholder="Search"> |
| </div> |
| <input type="hidden" name="check_keywords" value="yes" /> |
| <input type="hidden" name="area" value="default" /> |
| |
| </form> </li> |
| </ul> |
| <ul id="navbar" class="navbar navbar-right"> |
| <li> <a href="../index.html"><span class="flag-icon flag-icon-us"></span></a> </li> |
| <li> <a href="..//zh/index.html"><span class="flag-icon flag-icon-cn"></span></a> </li> |
| </ul> |
| </div> |
| </div> |
| </div> |
| Previous Navbar Layout End --> |
| <div class="navbar navbar-fixed-top"> |
| <div class="container" id="navContainer"> |
| <div class="innder" id="header-inner"> |
| <h1 id="logo-wrap"> |
| <a href="../" id="logo"><img src="http://data.mxnet.io/theme/mxnet.png"/></a> |
| </h1> |
| <nav class="nav-bar" id="main-nav"> |
| <a class="main-nav-link" href="../get_started/install.html">Install</a> |
| <a class="main-nav-link" href="../tutorials/index.html">Tutorials</a> |
| <a class="main-nav-link" href="../how_to/index.html">How To</a> |
| <span id="dropdown-menu-position-anchor"> |
| <a aria-expanded="true" aria-haspopup="true" class="main-nav-link dropdown-toggle" data-toggle="dropdown" href="#" role="button">API <span class="caret"></span></a> |
| <ul class="dropdown-menu" id="package-dropdown-menu"> |
| <li><a class="main-nav-link" href="../api/python/index.html">Python</a></li> |
| <li><a class="main-nav-link" href="../api/scala/index.html">Scala</a></li> |
| <li><a class="main-nav-link" href="../api/r/index.html">R</a></li> |
| <li><a class="main-nav-link" href="../api/julia/index.html">Julia</a></li> |
| <li><a class="main-nav-link" href="../api/c++/index.html">C++</a></li> |
| <li><a class="main-nav-link" href="../api/perl/index.html">Perl</a></li> |
| </ul> |
| </span> |
| <a class="main-nav-link" href="../architecture/index.html">Architecture</a> |
| <!-- <a class="main-nav-link" href="../community/index.html">Community</a> --> |
| <a class="main-nav-link" href="https://github.com/dmlc/mxnet">Github</a> |
| <span id="dropdown-menu-position-anchor-version" style="position: relative"><a href="#" class="main-nav-link dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">Versions(master)<span class="caret"></span></a><ul id="package-dropdown-menu" class="dropdown-menu"><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/">v0.10.14</a></li><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/versions/0.10/index.html">0.10</a></li><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/versions/master/index.html">master</a></li></ul></span></nav> |
| <script> function getRootPath(){ return "../" } </script> |
| <div class="burgerIcon dropdown"> |
| <a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button">☰</a> |
| <ul class="dropdown-menu dropdown-menu-right" id="burgerMenu"> |
| <li><a href="../get_started/install.html">Install</a></li> |
| <li><a href="../tutorials/index.html">Tutorials</a></li> |
| <li><a href="../how_to/index.html">How To</a></li> |
| <li class="dropdown-submenu"> |
| <a href="#" tabindex="-1">API</a> |
| <ul class="dropdown-menu"> |
| <li><a href="../api/python/index.html" tabindex="-1">Python</a> |
| </li> |
| <li><a href="../api/scala/index.html" tabindex="-1">Scala</a> |
| </li> |
| <li><a href="../api/r/index.html" tabindex="-1">R</a> |
| </li> |
| <li><a href="../api/julia/index.html" tabindex="-1">Julia</a> |
| </li> |
| <li><a href="../api/c++/index.html" tabindex="-1">C++</a> |
| </li> |
| <li><a href="../api/perl/index.html" tabindex="-1">Perl</a> |
| </li> |
| </ul> |
| </li> |
| <li><a href="../architecture/index.html">Architecture</a></li> |
| <li><a class="main-nav-link" href="https://github.com/dmlc/mxnet">Github</a></li> |
| <li id="dropdown-menu-position-anchor-version-mobile" class="dropdown-submenu" style="position: relative"><a href="#" tabindex="-1">Versions(master)</a><ul class="dropdown-menu"><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/">v0.10.14</a></li><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/versions/0.10/index.html">0.10</a></li><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/versions/master/index.html">master</a></li></ul></li></ul> |
| </div> |
| <div class="plusIcon dropdown"> |
| <a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button"><span aria-hidden="true" class="glyphicon glyphicon-plus"></span></a> |
| <ul class="dropdown-menu dropdown-menu-right" id="plusMenu"></ul> |
| </div> |
| <div id="search-input-wrap"> |
| <form action="../search.html" autocomplete="off" class="" method="get" role="search"> |
| <div class="form-group inner-addon left-addon"> |
| <i class="glyphicon glyphicon-search"></i> |
| <input class="form-control" name="q" placeholder="Search" type="text"/> |
| </div> |
| <input name="check_keywords" type="hidden" value="yes"> |
| <input name="area" type="hidden" value="default"/> |
| </form> |
| <div id="search-preview"></div> |
| </div> |
| <div id="searchIcon"> |
| <span aria-hidden="true" class="glyphicon glyphicon-search"></span> |
| </div> |
| <!-- <div id="lang-select-wrap"> --> |
| <!-- <label id="lang-select-label"> --> |
| <!-- <\!-- <i class="fa fa-globe"></i> -\-> --> |
| <!-- <span></span> --> |
| <!-- </label> --> |
| <!-- <select id="lang-select"> --> |
| <!-- <option value="en">Eng</option> --> |
| <!-- <option value="zh">中文</option> --> |
| <!-- </select> --> |
| <!-- </div> --> |
| <!-- <a id="mobile-nav-toggle"> |
| <span class="mobile-nav-toggle-bar"></span> |
| <span class="mobile-nav-toggle-bar"></span> |
| <span class="mobile-nav-toggle-bar"></span> |
| </a> --> |
| </div> |
| </div> |
| </div> |
| <div class="container"> |
| <div class="row"> |
| <div aria-label="main navigation" class="sphinxsidebar leftsidebar" role="navigation"> |
| <div class="sphinxsidebarwrapper"> |
| <ul> |
| <li class="toctree-l1"><a class="reference internal" href="../api/python/index.html">Python Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/r/index.html">R Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/julia/index.html">Julia Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/c++/index.html">C++ Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/scala/index.html">Scala Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../api/perl/index.html">Perl Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../how_to/index.html">HowTo Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="index.html">System Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../tutorials/index.html">Tutorials</a></li> |
| </ul> |
| </div> |
| </div> |
| <div class="content"> |
| <div class="section" id="optimizing-memory-consumption-in-deep-learning"> |
| <span id="optimizing-memory-consumption-in-deep-learning"></span><h1>Optimizing Memory Consumption in Deep Learning<a class="headerlink" href="#optimizing-memory-consumption-in-deep-learning" title="Permalink to this headline">¶</a></h1> |
| <p>Over the last ten years, a constant trend in deep learning |
| is towards deeper and larger networks. |
| Despite rapid advances in hardware performance, |
| cutting-edge deep learning models continue to push the limits of GPU RAM. |
| So even today, it’s always desirable to find ways |
| to train larger models while consuming less memory. |
| Doing so enables us to train faster, using larger batch sizes, |
| and consequently achieving a higher GPU utilization rate.</p> |
| <p>In this document, we explore techniques for optimizing |
| memory allocation for deep neural networks. |
| We discuss a few candidate solutions. |
| While our proposals are by no means exhaustive, |
| these solutions are instructive and allow us to |
| introduce the major design issues at play.</p> |
| <div class="section" id="computation-graph"> |
| <span id="computation-graph"></span><h2>Computation Graph<a class="headerlink" href="#computation-graph" title="Permalink to this headline">¶</a></h2> |
| <p>First, let’s revisit the idea of the computation graph. |
| A computation graph describes the (data flow) dependencies |
| between the operations in the deep network. |
| The operations performed in the graph |
| can be either fine-grained or coarse-grained. |
| The following figure shows two examples of computation graphs.</p> |
| <p><img alt="Comp Graph Example" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/comp_graph_example.png"/></p> |
| <p>The concept of a computation graph is explicitly encoded in packages like Theano and CGT. |
| In other libraries, computation graphs appear implicitly as network configuration files. |
| The major difference in these libraries comes down to how they calculate gradients. |
| There are mainly two ways: performing back-propagation on the <em>same</em> graph |
| or explicitly representing a <em>backwards path</em> to calculate the required gradients.</p> |
| <p><img alt="Backward Graph" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/back_graph.png"/></p> |
| <p>Libraries like Caffe, CXXNet, and Torch take the former approach, |
| performing back-prop on the original graph. |
| Libraries like Theano and CGT take the latter approach, |
| explicitly representing the backward path. |
| In this discussion, we adopt the <em>explicit backward path</em> approach |
| because it has several advantages for optimization.</p> |
| <p>However, we should emphasize that choosing the explicit backward path approach doesn’t restrict us |
| to symbolic libraries, such as Theano and CGT. We can also use the explicit backward path for gradient calculation of |
| layer-based (which ties forward and backward together) libraries. The following graph shows how to do this. |
| Basically, we introduce a backward node that links to the forward node of the graph and calls the <code class="docutils literal"><span class="pre">layer.backward</span></code> |
| in the backward operations.</p> |
| <p><img alt="Backward Layer" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/explicit_back_layer.png"/></p> |
| <p>This discussion applies to almost all existing deep learning libraries. |
| (There are differences between libraries, e.g., higher-order differentiation, which is beyond the scope of this topic.)</p> |
| <p>Why is the explicit backward path better? Let’s explain it with two examples. |
| The first reason is that the explicit backward path |
| clearly describes the dependency between computations. |
| Consider the following case, where we want to get |
| the gradient of A and B. As we can see clearly from the graph, |
| the computation of the <code class="docutils literal"><span class="pre">d(C)</span></code> gradient doesn’t depend on F. |
| This means that we can free the memory of <code class="docutils literal"><span class="pre">F</span></code> |
| right after the forward computation is done. |
| Similarly, the memory of <code class="docutils literal"><span class="pre">C</span></code> can be recycled.</p> |
| <p><img alt="Backward Prune" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/back_dep_prune.png"/></p> |
| <p>Another advantage of the explicit backward path |
| is the ability to have a different backward path, |
| instead of a mirror of the forward one. |
| A common example is the split connection case, |
| as shown in the following figure.</p> |
| <p><img alt="Backward Agg" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/back_agg_grad.png"/></p> |
| <p>In this example, the output of B is referenced by two operations. |
| If we want to do the gradient calculation in the same |
| network, we need to introduce an explicit split layer. |
| This means we need to do the split for the forward pass, too. |
| In this figure, the forward pass doesn’t contain a split layer, |
| but the graph will automatically insert a gradient |
| aggregation node before passing the gradient back to B. |
| This helps us to save the memory cost of allocating the output of the split layer, |
| and the operation cost of replicating the data in the forward pass.</p> |
| <p>If we adopt the explicit backward approach, |
| there’s no difference between the forward pass and the backward pass. |
| We simply step through the computation graph in chronological order |
| and carry out computations. |
| This makes the explicit backward approach easy to analyze. |
| We just need to answer the question: |
| how do we allocate memory for each output node of a computation graph?</p> |
| </div> |
| <div class="section" id="what-can-be-optimized"> |
| <span id="what-can-be-optimized"></span><h2>What Can Be Optimized?<a class="headerlink" href="#what-can-be-optimized" title="Permalink to this headline">¶</a></h2> |
| <p>As you can see, the computation graph is a useful way |
| to discuss memory allocation optimization techniques. |
| Already, we’ve shown how you can save some memory |
| by using the explicit backward graph. |
| Now let’s explore further optimizations, |
| and see how we might determine reasonable baselines for benchmarking.</p> |
| <p>Assume that we want to build a neural network with <code class="docutils literal"><span class="pre">n</span></code> layers. |
| Typically, when implementing a neural network, |
| we need to allocate node space for both the output of each layer |
| and the gradient values used during back-propagation. |
| This means we need roughly <code class="docutils literal"><span class="pre">2</span> <span class="pre">n</span></code> memory cells. |
| We face the same requirement when using the explicit backward graph approach |
| because the number of nodes in a backward pass |
| is roughly the same as in a forward pass.</p> |
| <div class="section" id="in-place-operations"> |
| <span id="in-place-operations"></span><h3>In-place Operations<a class="headerlink" href="#in-place-operations" title="Permalink to this headline">¶</a></h3> |
| <p>One of the simplest techniques we can employ |
| is <em>in-place memory sharing</em> across operations. |
| For neural networks, we can usually apply this technique |
| for the operations corresponding to activation functions. |
| Consider the following case, where we want |
| to compute the value of three chained sigmoid functions.</p> |
| <p><img alt="Inplace op" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_inline.png"/></p> |
| <p>Because we can compute sigmoid <code class="docutils literal"><span class="pre">in-place</span></code>, |
| using the same memory for input and output, |
| we can compute an arbitrary-length chain |
| of sigmoid functions using constant memory.</p> |
| <p>Note: it’s easy to make mistakes when implementing in-place optimization. |
| Consider the following case, where the value of B is used not only by C, but also by F.</p> |
| <p><img alt="In-place trap" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_inline_trap.png"/></p> |
| <p>We can’t perform in-place optimization because the value of B |
| is still needed after <code class="docutils literal"><span class="pre">C=sigmoid(B)</span></code> is computed. |
| An algorithm that simply does in-place optimization |
| for every sigmoid operation might fall into such a trap, |
| so we need to be careful about when we can use it.</p> |
| </div> |
| <div class="section" id="standard-memory-sharing"> |
| <span id="standard-memory-sharing"></span><h3>Standard Memory Sharing<a class="headerlink" href="#standard-memory-sharing" title="Permalink to this headline">¶</a></h3> |
| <p>In-place operations are not the only places where we can share memory. |
| In the following example, because the value of B is no longer needed |
| after we compute E, we can reuse B’s memory to hold the result of E.</p> |
| <p><img alt="Normal Sharing" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_normal.png"/></p> |
| <p><em>Memory sharing doesn’t necessarily require the same data shape</em>. |
| Note that in the preceding example, the shapes of <code class="docutils literal"><span class="pre">B</span></code> and <code class="docutils literal"><span class="pre">E</span></code> can differ. |
| To handle such a situation, we can allocate a memory region |
| of size equal to the maximum of that required by <code class="docutils literal"><span class="pre">B</span></code> and <code class="docutils literal"><span class="pre">E</span></code> and share it between them.</p> |
| </div> |
| <div class="section" id="example-of-real-neural-network-allocation"> |
| <span id="example-of-real-neural-network-allocation"></span><h3>Example of Real Neural Network Allocation<a class="headerlink" href="#example-of-real-neural-network-allocation" title="Permalink to this headline">¶</a></h3> |
| <p>Of course, these are only toy examples and they address only the computation of the forward pass. |
| But the same ideas apply to real neural networks. |
| The following figure shows an allocation plan for a two-layer perceptron.</p> |
| <p><img alt="Net Alloc" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_mlp.png"/></p> |
| <p>In this example:</p> |
| <ul class="simple"> |
| <li>In-place optimization is applied when computing <code class="docutils literal"><span class="pre">act1</span></code>, <code class="docutils literal"><span class="pre">d(fc1)</span></code>, <code class="docutils literal"><span class="pre">out</span></code> and <code class="docutils literal"><span class="pre">d(fc2)</span></code>.</li> |
| <li>Memory is shared between <code class="docutils literal"><span class="pre">d(act1)</span></code> and <code class="docutils literal"><span class="pre">d(A)</span></code>.</li> |
| </ul> |
| </div> |
| </div> |
| <div class="section" id="memory-allocation-algorithm"> |
| <span id="memory-allocation-algorithm"></span><h2>Memory Allocation Algorithm<a class="headerlink" href="#memory-allocation-algorithm" title="Permalink to this headline">¶</a></h2> |
| <p>So far, we’ve discussed general techniques for optimizing memory allocation. |
| We’ve seen that there are traps to avoid, |
| as demonstrated in the case of in-place memory optimization. |
| So, how can we allocate memory correctly? |
| This is not a new problem. |
| For example, it is very similar |
| to the problem with register allocation in compilers. |
| There might be techniques that we can borrow. |
| We’re not attempting to give a comprehensive review of techniques here, |
| but rather to introduce some simple |
| but useful tricks to attack the problem.</p> |
| <p>The key problem is that we need to place resources |
| so that they don’t conflict with each other. |
| More specifically, each variable has a <em>life time</em> |
| between the time it gets computed until the last time it is used. |
| In the case of the multi-layer perceptron, |
| the <em>life time</em> of <code class="docutils literal"><span class="pre">fc1</span></code> ends after <code class="docutils literal"><span class="pre">act1</span></code> gets computed.</p> |
| <p><img alt="Net Alloc" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_mlp.png"/></p> |
| <p>The principle is <em>to allow memory sharing only between variables whose lifetimes don’t overlap</em>. |
| There are multiple ways to do this. |
| You can construct the conflicting graph |
| with each variable as a node and link the edge |
| between variables with overlapping lifespans, |
| and then run a graph-coloring algorithm. |
| This likely has <span class="math">\(O(n^2)\)</span> complexity, |
| where <code class="docutils literal"><span class="pre">n</span></code> is the number of nodes in the graph. |
| This might be too costly.</p> |
| <p>Let’s consider another simple heuristic. |
| The idea is to simulate the procedure of traversing the graph, |
| and keep a count of future operations that depend on the node.</p> |
| <p><img alt="Alloc" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_step.png"/></p> |
| <ul class="simple"> |
| <li>An in-place optimization can be performed when only the current operation depends on the source (i.e., <code class="docutils literal"><span class="pre">count==1</span></code>).</li> |
| <li>Memory can be recycled into the box on the upper right corner when the <code class="docutils literal"><span class="pre">count</span></code> goes to 0.</li> |
| <li>When we need new memory, we can either get it from the box or allocate a new one.</li> |
| </ul> |
| <p><strong><em>Note:</em></strong> During the simulation, no memory is allocated. |
| Instead, we keep a record of how much memory each node needs, |
| and allocate the maximum of the shared parts in the final memory plan.</p> |
| </div> |
| <div class="section" id="static-vs-dynamic-allocation"> |
| <span id="static-vs-dynamic-allocation"></span><h2>Static vs. Dynamic Allocation<a class="headerlink" href="#static-vs-dynamic-allocation" title="Permalink to this headline">¶</a></h2> |
| <p>The preceding strategy exactly simulates |
| the dynamic memory allocation procedure |
| in imperative languages, such as Python. |
| The <code class="docutils literal"><span class="pre">count</span></code> is the reference counter for each memory object, |
| and the object gets garbage collected |
| when the reference counter goes to 0. |
| In that sense, |
| we are simulating dynamic memory allocation once |
| to create a static allocation plan. |
| Can we simply use an imperative language |
| that dynamically allocates and deallocates memory?</p> |
| <p>The major difference is that static allocation is only done once, |
| so we can afford to use more complicated algorithms. |
| For example, we can search for memory sizes |
| that are similar to the required memory block. |
| The allocation can also be made graph-aware. |
| We’ll talk about that in the next section. |
| Dynamic allocation puts more pressure |
| on fast memory allocation and garbage collection.</p> |
| <p>There is also one takeaway for users |
| who want to rely on dynamic memory allocations: |
| <em>do not unnecessarily reference objects</em>. |
| For example, if we organize all of the nodes in a list |
| and store them in a Net object, |
| these nodes will never get dereferenced, and we gain no space. |
| Unfortunately, this is a common way to organize code.</p> |
| </div> |
| <div class="section" id="memory-allocation-for-parallel-operations"> |
| <span id="memory-allocation-for-parallel-operations"></span><h2>Memory Allocation for Parallel Operations<a class="headerlink" href="#memory-allocation-for-parallel-operations" title="Permalink to this headline">¶</a></h2> |
| <p>In the previous section, we discussed |
| how we can <em>simulate</em> running the procedure |
| for a computation graph to get a static allocation plan. |
| However, optimizing for parallel computation presents other challenges |
| because resource sharing and parallelization are on the two ends of a balance. |
| Let’s look at the following two allocation plans for the same graph:</p> |
| <p><img alt="Parallel Alloc" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/parallel_alloc.png"/></p> |
| <p>Both allocation plans are valid |
| if we run the computation serially, |
| from <code class="docutils literal"><span class="pre">A[1]</span></code> to <code class="docutils literal"><span class="pre">A[8]</span></code>. |
| However, the allocation plan on the left |
| introduces additional dependencies, |
| which means we can’t run computation of <code class="docutils literal"><span class="pre">A[2]</span></code> and <code class="docutils literal"><span class="pre">A[5]</span></code> in parallel. |
| The plan on the right can. |
| To parallelize computation, we need to take greater care.</p> |
| <div class="section" id="be-correct-and-safe-first"> |
| <span id="be-correct-and-safe-first"></span><h3>Be Correct and Safe First<a class="headerlink" href="#be-correct-and-safe-first" title="Permalink to this headline">¶</a></h3> |
| <p>Being correct is our first principle. |
| This means executing in a way that takes the implicit dependencies |
| introduced by memory sharing into consideration. |
| You can do this by adding the implicit dependency edge to the execution graph. |
| Or, even simpler, if the execution engine is mutation aware, |
| as described in <a class="reference external" href="http://mxnet.io/architecture/note_engine.html">our discussion of dependency engine design</a>, |
| push the operation in sequence |
| and write to the same variable tag |
| that represents the same memory region.</p> |
| <p>Always produce a safe memory allocation plan. |
| This means never allocate the same memory |
| to nodes that can be parallelized. |
| This might not be ideal when memory reduction is more desirable, |
| and we don’t gain too much when we can get benefit |
| from multiple computing streams simultaneously executing on the same GPU.</p> |
| </div> |
| <div class="section" id="try-to-allow-more-parallelization"> |
| <span id="try-to-allow-more-parallelization"></span><h3>Try to Allow More Parallelization<a class="headerlink" href="#try-to-allow-more-parallelization" title="Permalink to this headline">¶</a></h3> |
| <p>Now we can safely perform some optimizations. |
| The general idea is to try and encourage memory sharing between nodes that can’t be parallelized. |
| You can do this by creating an ancestor relationship |
| graph and querying it during allocation, |
| which costs approximately <span class="math">\(O(n^2)\)</span> in time to construct. |
| We can also use a heuristic here, |
| for example, color the path in the graph. |
| As shown in the following figure, |
| when you try to find the longest paths in the graph, |
| color them the same color and continue.</p> |
| <p><img alt="Path Color" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/graph_color.png"/></p> |
| <p>After you get the color of the node, |
| you allow sharing (or encourage sharing) |
| only between nodes of the same color. |
| This is a stricter version of the ancestor relationship, |
| but it costs only <span class="math">\(O(n)\)</span> of time |
| if you search for only the first <code class="docutils literal"><span class="pre">k</span></code> path.</p> |
| <p>This is by no means the only solution. |
| More sophisticated approaches might exist.</p> |
| </div> |
| </div> |
| <div class="section" id="how-much-can-you-save"> |
| <span id="how-much-can-you-save"></span><h2>How Much Can You Save?<a class="headerlink" href="#how-much-can-you-save" title="Permalink to this headline">¶</a></h2> |
| <p>We’ve discussed the techniques and algorithms you can use |
| to squeeze memory usage for deep learning. |
| How much can you really save by using these techniques?</p> |
| <p>On coarse-grained operation graphs |
| that are already optimized for big operations, |
| you can reduce memory consumption roughly <em>by half</em>. |
| You can reduce memory usage even more |
| if you are optimizing a fine-grained computation network |
| used by symbolic libraries, such as Theano.</p> |
| <p>Most of the ideas in this article inspired the design of <em>MXNet</em>. |
| We’ve also provided a <a class="reference external" href="https://github.com/dmlc/mxnet/tree/master/example/memcost">Memory Cost Estimation Script</a>, |
| which you can use to see how much memory you need under different scenarios.</p> |
| <p>The script has an option called <code class="docutils literal"><span class="pre">forward_only</span></code>, |
| which shows the cost of running only the forward pass. |
You will find that the cost when using this option
is extremely low compared to others.
| This is simply because there’s more memory reuse |
| if you run only the forward pass.</p> |
| <p>So here are two takeaways:</p> |
| <ul class="simple"> |
| <li>Use a computation graph to allocate memory.</li> |
| <li>For deep learning models, prediction consumes much less memory than training.</li> |
| </ul> |
| </div> |
| <div class="section" id="next-steps"> |
| <span id="next-steps"></span><h2>Next Steps<a class="headerlink" href="#next-steps" title="Permalink to this headline">¶</a></h2> |
| <div class="toctree-wrapper compound"> |
| <ul> |
| <li class="toctree-l1"><a class="reference external" href="http://mxnet.io/architecture/note_data_loading.html">Efficient Data Loading Module for Deep Learning</a></li> |
| <li class="toctree-l1"><a class="reference external" href="http://mxnet.io/architecture/rnn_interface.html">Survey of RNN Interface</a></li> |
| </ul> |
| </div> |
| </div> |
| </div> |
| <div class="container"> |
| <div class="footer"> |
| <p> © 2015-2017 DMLC. All rights reserved. </p> |
| </div> |
| </div> |
| </div> |
| <div aria-label="main navigation" class="sphinxsidebar rightsidebar" role="navigation"> |
| <div class="sphinxsidebarwrapper"> |
| <h3><a href="../index.html">Table Of Contents</a></h3> |
| <ul> |
| <li><a class="reference internal" href="#">Optimizing Memory Consumption in Deep Learning</a><ul> |
| <li><a class="reference internal" href="#computation-graph">Computation Graph</a></li> |
| <li><a class="reference internal" href="#what-can-be-optimized">What Can Be Optimized?</a><ul> |
| <li><a class="reference internal" href="#in-place-operations">In-place Operations</a></li> |
| <li><a class="reference internal" href="#standard-memory-sharing">Standard Memory Sharing</a></li> |
| <li><a class="reference internal" href="#example-of-real-neural-network-allocation">Example of Real Neural Network Allocation</a></li> |
| </ul> |
| </li> |
| <li><a class="reference internal" href="#memory-allocation-algorithm">Memory Allocation Algorithm</a></li> |
| <li><a class="reference internal" href="#static-vs-dynamic-allocation">Static vs. Dynamic Allocation</a></li> |
| <li><a class="reference internal" href="#memory-allocation-for-parallel-operations">Memory Allocation for Parallel Operations</a><ul> |
| <li><a class="reference internal" href="#be-correct-and-safe-first">Be Correct and Safe First</a></li> |
| <li><a class="reference internal" href="#try-to-allow-more-parallelization">Try to Allow More Parallelization</a></li> |
| </ul> |
| </li> |
<li><a class="reference internal" href="#how-much-can-you-save">How Much Can You Save?</a></li>
| <li><a class="reference internal" href="#next-steps">Next Steps</a></li> |
| </ul> |
| </li> |
| </ul> |
| </div> |
| </div> |
| </div> <!-- pagename != index --> |
| <script crossorigin="anonymous" integrity="sha384-0mSbJDEHialfmuBBQP6A4Qrprq5OVfW37PRR3j5ELqxss1yVqOtnepnHVP9aJ7xS" src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js"></script> |
| <script src="../_static/js/sidebar.js" type="text/javascript"></script> |
| <script src="../_static/js/search.js" type="text/javascript"></script> |
| <script src="../_static/js/navbar.js" type="text/javascript"></script> |
| <script src="../_static/js/clipboard.min.js" type="text/javascript"></script> |
| <script src="../_static/js/copycode.js" type="text/javascript"></script> |
<!-- Reveal the page once the DOM is ready (the body is presumably hidden
     by a stylesheet to avoid a flash of unstyled content — confirm in
     mxnet.css). `type` is omitted: text/javascript is the default. -->
<script>
// $(fn) is the supported DOM-ready form; calling .ready() on an arbitrary
// jQuery collection such as $('body') is deprecated and behaves identically
// to $(document).ready(fn).
$(function () {
  $('body').css('visibility', 'visible');
});
</script>
| </div></body> |
| </html> |