<!DOCTYPE html>
<html lang="en"><head>
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <link href="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/mxnet-icon.png" rel="icon" type="image/png"><!-- Begin Jekyll SEO tag v2.6.1 -->
<title>Memory Consumption | Apache MXNet</title>
<meta name="generator" content="Jekyll v3.8.6" />
<meta property="og:title" content="Memory Consumption" />
<meta property="og:locale" content="en_US" />
<meta name="description" content="A flexible and efficient library for deep learning." />
<meta property="og:description" content="A flexible and efficient library for deep learning." />
<link rel="canonical" href="https://mxnet.apache.org/versions/1.7/api/architecture/note_memory" />
<meta property="og:url" content="https://mxnet.apache.org/versions/1.7/api/architecture/note_memory" />
<meta property="og:site_name" content="Apache MXNet" />
<script type="application/ld+json">
{"description":"A flexible and efficient library for deep learning.","headline":"Memory Consumption","@type":"WebPage","url":"https://mxnet.apache.org/versions/1.7/api/architecture/note_memory","@context":"https://schema.org"}</script>
<!-- End Jekyll SEO tag -->
<script src="https://medium-widget.pixelpoint.io/widget.js"></script>
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.css" />
  <link rel="stylesheet" href="/versions/1.7/assets/main.css"><link type="application/atom+xml" rel="alternate" href="https://mxnet.apache.org/versions/1.7/feed.xml" title="Apache MXNet" /><script>
// Run Google Analytics only when none of the Do Not Track flags checked below is set
// (window.doNotTrack, navigator.doNotTrack as "1" or "yes", navigator.msDoNotTrack).
if(!(window.doNotTrack === "1" || navigator.doNotTrack === "1" || navigator.doNotTrack === "yes" || navigator.msDoNotTrack === "1")) {
  // Loader snippet: stores the name 'ga' in window.GoogleAnalyticsObject, defines
  // window.ga (unless it already exists) as a stub that pushes its arguments onto
  // ga.q, stamps ga.l with the current time, then creates a script element with
  // async set, pointing at analytics.js, and inserts it before the first script
  // element found in the document.
  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
  })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');

  // Queue the tracker creation for property UA-96378503-1, then a pageview hit.
  ga('create', 'UA-96378503-1', 'auto');
  ga('send', 'pageview');
}
</script>
  
<script src="/versions/1.7/assets/js/jquery-3.3.1.min.js"></script><script src="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.js" defer></script>
  <script src="/versions/1.7/assets/js/globalSearch.js" defer></script>
  <script src="/versions/1.7/assets/js/clipboard.js" defer></script>
  <script src="/versions/1.7/assets/js/copycode.js" defer></script></head>
<body><header class="site-header" role="banner">

  <script>
    $(document).ready(function () {

      // Header background: the alpha channel starts at 0.4 and grows by 1/300
      // for every pixel the window has been scrolled.
      var applyHeaderOpacity = function () {
        var alpha = $(window).scrollTop() / 300 + 0.4;
        var headerColor = "rgba(4,140,204," + alpha + ")";
        $('.site-header').css("background-color", headerColor);
      };

      // Recompute on every scroll event, and once right away for the initial state.
      $(window).on("scroll", function () {
        applyHeaderOpacity();
      });
      applyHeaderOpacity();

      // Menu selector: flag every nav link whose resolved href occurs
      // inside the current page URL.
      $('.page-link').each(function (_index, link) {
        var currentUrl = window.location.href;
        if (currentUrl.includes(link.href)) {
          $(link).addClass("page-current");
        }
      });
    });
  </script>
  <div class="wrapper">
    <a class="site-title" rel="author" href="/versions/1.7/"><img
            src="/versions/1.7/assets/img/mxnet_logo.png" class="site-header-logo" alt="Apache MXNet"></a>
    <nav class="site-nav">
      <input type="checkbox" id="nav-trigger" class="nav-trigger"/>
      <label for="nav-trigger">
          <span class="menu-icon">
            <svg viewBox="0 0 18 15" width="18px" height="15px">
              <path d="M18,1.484c0,0.82-0.665,1.484-1.484,1.484H1.484C0.665,2.969,0,2.304,0,1.484l0,0C0,0.665,0.665,0,1.484,0 h15.032C17.335,0,18,0.665,18,1.484L18,1.484z M18,7.516C18,8.335,17.335,9,16.516,9H1.484C0.665,9,0,8.335,0,7.516l0,0 c0-0.82,0.665-1.484,1.484-1.484h15.032C17.335,6.031,18,6.696,18,7.516L18,7.516z M18,13.516C18,14.335,17.335,15,16.516,15H1.484 C0.665,15,0,14.335,0,13.516l0,0c0-0.82,0.665-1.483,1.484-1.483h15.032C17.335,12.031,18,12.695,18,13.516L18,13.516z"/>
            </svg>
          </span>
      </label>
      <div class="gs-search-border">
        <div id="gs-search-icon"></div>
        <form id="global-search-form">
          <input id="global-search" type="text" title="Search" placeholder="Search" />
          <div id="global-search-dropdown-container">
            <button class="gs-current-version btn" type="button" data-toggle="dropdown">
                <span id="gs-current-version-label">1.7</span>
                <svg class="gs-dropdown-caret" viewBox="0 0 32 32" aria-hidden="true">
                    <path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path>
                </svg>
            </button>
            <ul class="gs-opt-group gs-version-dropdown">
              
                
                  <li class="gs-opt gs-versions">master</li>
                
              
                
                  <li class="gs-opt gs-versions active">1.7</li>
                
              
                
                  <li class="gs-opt gs-versions">1.6</li>
                
              
                
                  <li class="gs-opt gs-versions">1.5.0</li>
                
              
                
                  <li class="gs-opt gs-versions">1.4.1</li>
                
              
                
                  <li class="gs-opt gs-versions">1.3.1</li>
                
              
                
                  <li class="gs-opt gs-versions">1.2.1</li>
                
              
                
                  <li class="gs-opt gs-versions">1.1.0</li>
                
              
                
                  <li class="gs-opt gs-versions">1.0.0</li>
                
              
                
                  <li class="gs-opt gs-versions">0.12.1</li>
                
              
                
                  <li class="gs-opt gs-versions">0.11.0</li>
                
              
            </ul>
        </div>
          <span id="global-search-close">x</span>
        </form>
      </div>
      <div class="trigger">
        <div id="global-search-mobile-border">
          <div id="gs-search-icon-mobile"></div>
          <input id="global-search-mobile" placeholder="Search..." type="text"/>
          <div id="global-search-dropdown-container-mobile">
            <button class="gs-current-version-mobile btn" type="button" data-toggle="dropdown">
                <svg class="gs-dropdown-caret" viewBox="0 0 32 32" aria-hidden="true">
                    <path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path>
                </svg>
            </button>
            <ul class="gs-opt-group gs-version-dropdown-mobile">
              
                
                  <li class="gs-opt gs-versions">master</li>
                
              
                
                  <li class="gs-opt gs-versions active">1.7</li>
                
              
                
                  <li class="gs-opt gs-versions">1.6</li>
                
              
                
                  <li class="gs-opt gs-versions">1.5.0</li>
                
              
                
                  <li class="gs-opt gs-versions">1.4.1</li>
                
              
                
                  <li class="gs-opt gs-versions">1.3.1</li>
                
              
                
                  <li class="gs-opt gs-versions">1.2.1</li>
                
              
                
                  <li class="gs-opt gs-versions">1.1.0</li>
                
              
                
                  <li class="gs-opt gs-versions">1.0.0</li>
                
              
                
                  <li class="gs-opt gs-versions">0.12.1</li>
                
              
                
                  <li class="gs-opt gs-versions">0.11.0</li>
                
              
            </ul>
        </div>
        </div>
        <a class="page-link" href="/versions/1.7/get_started">Get Started</a>
        <a class="page-link" href="/versions/1.7/blog">Blog</a>
        <a class="page-link" href="/versions/1.7/features">Features</a>
        <a class="page-link" href="/versions/1.7/ecosystem">Ecosystem</a>
        <a class="page-link" href="/versions/1.7/api">Docs & Tutorials</a>
        <a class="page-link" href="https://github.com/apache/incubator-mxnet">GitHub</a>
        <div class="dropdown">
          <span class="dropdown-header">1.7
            <svg class="dropdown-caret" viewBox="0 0 32 32" aria-hidden="true"><path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path></svg>
          </span>
          <div class="dropdown-content">
            <a href="/">master</a>
            <a class="dropdown-option-active" href="/versions/1.7/">1.7</a>
            <a href="/versions/1.6/">1.6</a>
            <a href="/versions/1.5.0/">1.5.0</a>
            <a href="/versions/1.4.1/">1.4.1</a>
            <a href="/versions/1.3.1/">1.3.1</a>
            <a href="/versions/1.2.1/">1.2.1</a>
            <a href="/versions/1.1.0/">1.1.0</a>
            <a href="/versions/1.0.0/">1.0.0</a>
            <a href="/versions/0.12.1/">0.12.1</a>
            <a href="/versions/0.11.0/">0.11.0</a>
          </div>
        </div>
      </div>
    </nav>
  </div>
</header>
<main class="page-content" aria-label="Content">
    <script>

</script>
<article class="post">

    <header class="post-header wrapper">
        <h1 class="post-title">Memory Consumption</h1>
        <h3></h3></header>

    <div class="post-content">
        <div class="wrapper">
            <div class="row">
    <div class="col-3 docs-side-bar">
        <h3 style="text-transform: capitalize; padding-left:10px">architecture</h3>
        <ul>
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
            
            <li><a href="/versions/1.7/api/architecture/exception_handling">Exception Handling in MXNet</a></li>
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
            
            <li><a href="/versions/1.7/api/architecture/note_data_loading">Efficient Data Loaders</a></li>
              <!-- page-category -->
            
            
            <li><a href="/versions/1.7/api/architecture/note_engine">Dependency Engine</a></li>
              <!-- page-category -->
            
            
            <li><a href="/versions/1.7/api/architecture/note_memory">Memory Consumption</a></li>
              <!-- page-category -->
            
              <!-- page-category -->
            
            
            <li><a href="/versions/1.7/api/architecture/overview">MXNet System Architecture</a></li>
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
            
            <li><a href="/versions/1.7/api/architecture/program_model">Deep Learning Programming Paradigm</a></li>
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
            
              <!-- page-category -->
               <!-- resource-p -->
        </ul>
    </div>
    <div class="col-9">
        <!--- Licensed to the Apache Software Foundation (ASF) under one -->

<!--- or more contributor license agreements.  See the NOTICE file -->

<!--- distributed with this work for additional information -->

<!--- regarding copyright ownership.  The ASF licenses this file -->

<!--- to you under the Apache License, Version 2.0 (the -->

<!--- "License"); you may not use this file except in compliance -->

<!--- with the License.  You may obtain a copy of the License at -->

<!---   http://www.apache.org/licenses/LICENSE-2.0 -->

<!--- Unless required by applicable law or agreed to in writing, -->

<!--- software distributed under the License is distributed on an -->

<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->

<!--- KIND, either express or implied.  See the License for the -->

<!--- specific language governing permissions and limitations -->

<!--- under the License. -->

<h1 id="optimizing-memory-consumption-in-deep-learning">Optimizing Memory Consumption in Deep Learning</h1>

<p>Over the last ten years, a constant trend in deep learning
is towards deeper and larger networks.
Despite rapid advances in hardware performance,
cutting-edge deep learning models continue to push the limits of GPU RAM.
So even today, it&#39;s always desirable to find ways
to train larger models while consuming less memory.
Doing so enables us to train faster, using larger batch sizes,
and consequently achieving a higher GPU utilization rate.</p>

<p>In this document, we explore techniques for optimizing
memory allocation for deep neural networks.
We discuss a few candidate solutions.
While our proposals are by no means exhaustive,
these solutions are instructive and allow us to
introduce the major design issues at play.</p>

<h2 id="computation-graph">Computation Graph</h2>

<p>First, let&#39;s revisit the idea of the computation graph.
A computation graph describes the (data flow) dependencies
between the operations in the deep network.
The operations performed in the graph
can be either fine-grained or coarse-grained.
The following figure shows two examples of computation graphs.</p>

<p><img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/comp_graph_example.png" alt="Comp Graph Example"></p>

<p>The concept of a computation graph is explicitly encoded in packages like Theano and CGT.
In other libraries, computation graphs appear implicitly as network configuration files.
The major difference in these libraries comes down to how they calculate gradients.
There are mainly two ways: performing back-propagation on the <em>same</em> graph
or explicitly representing a <em>backwards path</em> to calculate the required gradients.</p>

<p><img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/back_graph.png" alt="Backward Graph"></p>

<p>Libraries like Caffe, CXXNet, and Torch take the former approach,
performing back-prop on the original graph.
Libraries like Theano and CGT take the latter approach,
explicitly representing the backward path.
In this discussion, we adopt the <em>explicit backward path</em> approach
because it has several advantages for optimization.</p>

<p>However, we should emphasize that choosing the explicit backward path approach doesn&#39;t restrict us
to symbolic libraries, such as Theano and CGT. We can also use the explicit backward path for gradient calculation of
layer-based (which ties forward and backward together) libraries. The following graph shows how to do this.
Basically, we introduce a backward node that links to the forward node of the graph and calls the <code>layer.backward</code>
in the backward operations.</p>

<p><img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/explicit_back_layer.png" alt="Backward Layer"></p>

<p>This discussion applies to almost all existing deep learning libraries.
(There are differences between libraries,  e.g., higher-order differentiation, which is beyond the scope of this topic.)</p>

<p>Why is the explicit backward path better? Let&#39;s explain it with two examples.
The first reason is that the explicit backward path
clearly describes the dependency between computations.
Consider the following case, where we want to get
the gradient of A and B. As we can see clearly from the graph,
the computation of the <code>d(C)</code> gradient doesn&#39;t depend on F.
This means that we can free the memory of <code>F</code>
right after the forward computation is done.
Similarly, the memory of <code>C</code> can be recycled.</p>

<p><img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/back_dep_prune.png" alt="Backward Prune"></p>

<p>Another advantage of the explicit backward path
is the ability to have a different backward path,
instead of a mirror of the forward one.
A common example is the split connection case,
as shown in the following figure.</p>

<p><img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/back_agg_grad.png" alt="Backward Agg"></p>

<p>In this example, the output of B is referenced by two operations.
If we want to do the gradient calculation in the same
network, we need to introduce an explicit split layer.
This means we need to do the split for the forward pass, too.
In this figure, the forward pass doesn&#39;t contain a split layer,
but the graph will automatically insert a gradient
aggregation node before passing the gradient back to B.
This helps us to save the memory cost of allocating the output of the split layer,
and the operation cost of replicating the data in the forward pass.</p>

<p>If we adopt the explicit backward approach,
there&#39;s no difference between the forward pass and the backward pass.
We simply step through the computation graph in chronological order
and carry out computations.
This makes the explicit backward approach easy to analyze.
We just need to answer the question:
how do we allocate memory for each output node of a computation graph?</p>

<h2 id="what-can-be-optimized">What Can Be Optimized?</h2>

<p>As you can see, the computation graph is a useful way
to discuss memory allocation optimization techniques.
Already, we&#39;ve shown how you can save some memory
by using the explicit backward graph.
Now let&#39;s explore further optimizations,
and see how we might determine reasonable baselines for benchmarking.</p>

<p>Assume that we want to build a neural network with <code>n</code> layers.
Typically, when implementing a neural network,
we need to allocate node space for both the output of each layer
and the gradient values used during back-propagation.
This means we need roughly <code>2 n</code> memory cells.
We face the same requirement when using the explicit backward graph approach
because the number of nodes in a backward pass
is roughly the same as in a forward pass.</p>

<h3 id="in-place-operations">In-place Operations</h3>

<p>One of the simplest techniques we can employ
is <em>in-place memory sharing</em> across operations.
For neural networks, we can usually apply this technique
for the operations corresponding to activation functions.
Consider the following case, where we want
to compute the value of three chained sigmoid functions.</p>

<p><img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_inline.png" alt="Inplace op"></p>

<p>Because we can compute sigmoid <code>in-place</code>,
using the same memory for input and output,
we can compute an arbitrary-length chain
of sigmoid functions using constant memory.</p>

<p>Note: it&#39;s easy to make mistakes when implementing in-place optimization.
Consider the following case, where the value of B is used not only by C, but also by F.</p>

<p><img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_inline_trap.png" alt="In-place trap"></p>

<p>We can&#39;t perform in-place optimization because the value of B
is still needed after <code>C=sigmoid(B)</code> is computed.
An algorithm that simply does in-place optimization
for every sigmoid operation might fall into such a trap,
so we need to be careful about when we can use it.</p>

<h3 id="standard-memory-sharing">Standard Memory Sharing</h3>

<p>In-place operations are not the only places where we can share memory.
In the following example, because the value of B is no longer needed
after we compute E, we can reuse B&#39;s memory to hold the result of E.</p>

<p><img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_normal.png" alt="Normal Sharing"></p>

<p><em>Memory sharing doesn&#39;t necessarily require the same data shape</em>.
Note that in the preceding example, the shapes of <code>B</code> and <code>E</code> can differ.
To handle such a situation, we can allocate a memory region
of size equal to the maximum of that required by <code>B</code> and <code>E</code> and share it between them.</p>

<h3 id="example-of-real-neural-network-allocation">Example of Real Neural Network Allocation</h3>

<p>Of course, these are only toy examples and they address only the computation of the forward pass.
But the same ideas apply to real neural networks.
The following figure shows an allocation plan for a two-layer perceptron.</p>

<p><img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_mlp.png" alt="Net Alloc"></p>

<p>In this example:</p>

<ul>
<li>In-place optimization is applied when computing <code>act1</code>, <code>d(fc1)</code>, <code>out</code> and <code>d(fc2)</code>.</li>
<li>Memory is shared between <code>d(act1)</code> and <code>d(A)</code>.</li>
</ul>

<h2 id="memory-allocation-algorithm">Memory Allocation Algorithm</h2>

<p>So far, we&#39;ve discussed general techniques for optimizing memory allocation.
We&#39;ve seen that there are traps to avoid,
as demonstrated in the case of in-place memory optimization.
So, how can we allocate memory correctly?
This is not a new problem.
For example, it is very similar
to the problem with register allocation in compilers.
There might be techniques that we can borrow.
We&#39;re not attempting to give a comprehensive review of techniques here,
but rather to introduce some simple
but useful tricks to attack the problem.</p>

<p>The key problem is that we need to place resources
so that they don&#39;t conflict with each other.
More specifically, each variable has a <em>life time</em>
from the time it gets computed until the last time it is used.
In the case of the multi-layer perceptron,
the <em>life time</em> of <code>fc1</code> ends after <code>act1</code> gets computed.</p>

<p><img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_mlp.png" alt="Net Alloc"></p>

<p>The principle is <em>to allow memory sharing only between variables whose lifetimes don&#39;t overlap</em>.
There are multiple ways to do this.
You can construct the conflicting graph
with each variable as a node and link the edge
between variables with overlapping lifespans,
and then run a graph-coloring algorithm.
This likely has <code>O(n^2)</code> complexity,
where <code>n</code> is the number of nodes in the graph.
This might be too costly.</p>

<p>Let&#39;s consider another simple heuristic.
The idea is to simulate the procedure of traversing the graph,
and keep a count of future operations that depend on the node.</p>

<p><img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_step.png" alt="Alloc"></p>

<ul>
<li>An in-place optimization can be performed when only the current operation depends on the source (i.e., <code>count==1</code>).</li>
<li>Memory can be recycled into the box on the upper right corner when the <code>count</code> goes to 0.</li>
<li>When we need new memory, we can either get it from the box or allocate a new one.</li>
</ul>

<p><strong><em>Note:</em></strong> During the simulation, no memory is allocated.
Instead, we keep a record of how much memory each node needs,
and allocate the maximum of the shared parts in the final memory plan.</p>

<h2 id="static-vs-dynamic-allocation">Static vs. Dynamic Allocation</h2>

<p>The preceding strategy exactly simulates
the dynamic memory allocation procedure
in imperative languages, such as Python.
The <code>count</code> is the reference counter for each memory object,
and the object gets garbage collected
when the reference counter goes to 0.
In that sense,
we are simulating dynamic memory allocation once
to create a static allocation plan.
Can we simply use an imperative language
that dynamically allocates and deallocates memory?</p>

<p>The major difference is that static allocation is only done once,
so we can afford to use more complicated algorithms.
For example, we can search for memory sizes
that are similar to the required memory block.
The allocation can also be made graph-aware.
We&#39;ll talk about that in the next section.
Dynamic allocation puts more pressure
on fast memory allocation and garbage collection.</p>

<p>There is also one takeaway for users
who want to rely on dynamic memory allocations:
<em>do not unnecessarily reference objects</em>.
For example, if we organize all of the nodes in a list
and store them in a Net object,
these nodes will never get dereferenced, and we gain no space.
Unfortunately, this is a common way to organize code.</p>

<h2 id="memory-allocation-for-parallel-operations">Memory Allocation for Parallel Operations</h2>

<p>In the previous section, we discussed
how we can <em>simulate</em> running the procedure
for a computation graph to get a static allocation plan.
However, optimizing for parallel computation presents other challenges
because resource sharing and parallelization are on the two ends of a balance.
Let&#39;s look at the following two allocation plans for the same graph:</p>

<p><img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/parallel_alloc.png" alt="Parallel Alloc"></p>

<p>Both allocation plans are valid
if we run the computation serially,
from <code>A[1]</code> to <code>A[8]</code>.
However, the allocation plan on the left
introduces additional dependencies,
which means we can&#39;t run computation of <code>A[2]</code> and <code>A[5]</code> in parallel.
The plan on the right can.
To parallelize computation, we need to take greater care.</p>

<h3 id="be-correct-and-safe-first">Be Correct and Safe First</h3>

<p>Being correct is our first principle.
This means executing in a way that takes the implicit dependencies
introduced by memory sharing into consideration.
You can do this by adding the implicit dependency edge to the execution graph.
Or, even simpler, if the execution engine is mutation aware,
as described in <a href="note_engine">our discussion of dependency engine design</a>,
push the operation in sequence
and write to the same variable tag
that represents the same memory region.</p>

<p>Always produce a safe memory allocation plan.
This means never allocating the same memory
to nodes that can be parallelized.
This might not be ideal when memory reduction is more desirable
and we don&#39;t gain much from having
multiple computing streams execute simultaneously on the same GPU.</p>

<h3 id="try-to-allow-more-parallelization">Try to Allow More Parallelization</h3>

<p>Now we can safely perform some optimizations.
The general idea is to try and encourage memory sharing between nodes that can&#39;t be parallelized.
You can do this by creating an ancestor relationship
graph and querying it during allocation,
which costs approximately <code>$O(n^2)$</code> in time to construct.
We can also use a heuristic here,
for example, color the path in the graph.
As shown in the following figure,
when you try to find the longest paths in the graph,
color them the same color and continue.</p>

<p><img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/graph_color.png" alt="Path Color"></p>

<p>After you get the color of the node,
you allow sharing (or encourage sharing)
only between nodes of the same color.
This is a stricter version of the ancestor relationship,
but it costs only <code>$O(n)$</code> of time
if you search for only the first <code>k</code> paths.</p>

<p>This is by no means the only solution.
More sophisticated approaches might exist.</p>

<h2 id="how-much-can-you-save">How Much Can You Save?</h2>

<p>We&#39;ve discussed the techniques and algorithms you can use
to squeeze memory usage for deep learning.
How much can you really save by using these techniques?</p>

<p>On coarse-grained operation graphs
that are already optimized for big operations,
you can reduce memory consumption roughly <em>by half</em>.
You can reduce memory usage even more
if you are optimizing a fine-grained computation network
used by symbolic libraries, such as Theano. Most of the ideas in this article inspired the design of <em>MXNet</em>.</p>

<p>Also, you will notice that the memory cost of forward-pass-only execution is extremely low compared to running both the forward and backward passes. This is simply because there&#39;s more memory reuse if you run only the forward pass.</p>

<p>So here are two takeaways:</p>

<ul>
<li>Use a computation graph to allocate memory.</li>
<li>For deep learning models, prediction consumes much less memory than training.</li>
</ul>

<h2 id="next-steps">Next Steps</h2>

<ul>
<li><a href="note_data_loading">Efficient Data Loading Module for Deep Learning</a></li>
</ul>

    </div>
</div>

        </div>
    </div>

</article>

</main><footer class="site-footer h-card">
    <div class="wrapper">
        <div class="row">
            <div class="col-4">
                <h4 class="footer-category-title">Resources</h4>
                <ul class="contact-list">
                    <li><a href="/versions/1.7/community/contribute#mxnet-dev-communications">Mailing lists</a></li>
                    <li><a href="https://cwiki.apache.org/confluence/display/MXNET/Apache+MXNet+Home">Developer Wiki</a></li>
                    <li><a href="https://issues.apache.org/jira/projects/MXNET/issues">Jira Tracker</a></li>
                    <li><a href="https://github.com/apache/incubator-mxnet/labels/Roadmap">GitHub Roadmap</a></li>
                    <li><a href="https://discuss.mxnet.io">MXNet Discuss forum</a></li>
                    <li><a href="/versions/1.7/community/contribute">Contribute To MXNet</a></li>

                </ul>
            </div>

            <div class="col-4"><ul class="social-media-list"><li><a href="https://github.com/apache/incubator-mxnet"><svg class="svg-icon"><use xlink:href="/versions/1.7/assets/minima-social-icons.svg#github"></use></svg> <span class="username">apache/incubator-mxnet</span></a></li><li><a href="https://www.twitter.com/apachemxnet"><svg class="svg-icon"><use xlink:href="/versions/1.7/assets/minima-social-icons.svg#twitter"></use></svg> <span class="username">apachemxnet</span></a></li><li><a href="https://youtube.com/apachemxnet"><svg class="svg-icon"><use xlink:href="/versions/1.7/assets/minima-social-icons.svg#youtube"></use></svg> <span class="username">apachemxnet</span></a></li></ul>
</div>

            <div class="col-4 footer-text">
                <p>A flexible and efficient library for deep learning.</p>
            </div>
        </div>
    </div>
</footer>
<footer class="site-footer2">
    <div class="wrapper">
        <div class="row">
            <div class="col-3">
                <img src="/versions/1.7/assets/img/apache_incubator_logo.png" class="footer-logo col-2" alt="Apache Incubator logo">
            </div>
            <div class="footer-bottom-warning col-9">
                <p>Apache MXNet is an effort undergoing incubation at The Apache Software Foundation (ASF), <span
                        style="font-weight:bold">sponsored by the <i>Apache Incubator</i></span>. Incubation is required
                    of all newly accepted projects until a further review indicates that the infrastructure,
                    communications, and decision making process have stabilized in a manner consistent with other
                    successful ASF projects. While incubation status is not necessarily a reflection of the completeness
                    or stability of the code, it does indicate that the project has yet to be fully endorsed by the ASF.
                </p><p>"Copyright © 2017-2018, The Apache Software Foundation Apache MXNet, MXNet, Apache, the Apache
                    feather, and the Apache MXNet project logo are either registered trademarks or trademarks of the
                    Apache Software Foundation."</p>
            </div>
        </div>
    </div>
</footer>




</body>

</html>
