versions/master/architecture/note_engine.html - mxnet-test - Git at Google

 <!DOCTYPE html>

 <html lang="en">
 <head>
 <meta charset="utf-8"/>
 <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
 <meta content="width=device-width, initial-scale=1" name="viewport"/>
 <title>Dependency Engine for Deep Learning — mxnet  documentation</title>
 <link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css" integrity="sha384-1q8mTJOASx8j1Au+a5WDVnPi2lkFfwwEAa8hDDdjZlpLegxhjVME1fgjWPGmkzs7" rel="stylesheet"/>
 <link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css" rel="stylesheet"/>
 <link href="../_static/basic.css" rel="stylesheet" type="text/css">
 <link href="../_static/pygments.css" rel="stylesheet" type="text/css">
 <link href="../_static/mxnet.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript">
       var DOCUMENTATION_OPTIONS = {
         URL_ROOT:    '../',
         VERSION:     '',
         COLLAPSE_INDEX: false,
         FILE_SUFFIX: '.html',
         HAS_SOURCE:  true,
         SOURCELINK_SUFFIX: ''
       };
     </script>
 <script src="../_static/jquery-1.11.1.js" type="text/javascript"></script>
 <script src="../_static/underscore.js" type="text/javascript"></script>
 <script src="../_static/searchtools_custom.js" type="text/javascript"></script>
 <script src="../_static/doctools.js" type="text/javascript"></script>
 <script src="../_static/selectlang.js" type="text/javascript"></script>
 <script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript"></script>
 <script type="text/javascript"> jQuery(function() { Search.loadIndex("/searchindex.js"); Search.init();}); </script>
 <script>
       (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
       (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new
       Date();a=s.createElement(o),
       m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
       })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');

       ga('create', 'UA-96378503-1', 'auto');
       ga('send', 'pageview');

     </script>
 <!-- -->
 <!-- <script type="text/javascript" src="../_static/jquery.js"></script> -->
 <!-- -->
 <!-- <script type="text/javascript" src="../_static/underscore.js"></script> -->
 <!-- -->
 <!-- <script type="text/javascript" src="../_static/doctools.js"></script> -->
 <!-- -->
 <!-- <script type="text/javascript" src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script> -->
 <!-- -->
 <link href="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/mxnet-icon.png" rel="icon" type="image/png"/>
 </link></link></head>
 <body role="document"><!-- Previous Navbar Layout
 <div class="navbar navbar-default navbar-fixed-top">
   <div class="container">
     <div class="navbar-header">
       <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false" aria-controls="navbar">
         <span class="sr-only">Toggle navigation</span>
         <span class="icon-bar"></span>
         <span class="icon-bar"></span>
         <span class="icon-bar"></span>
       </button>
       <a href="../" class="navbar-brand">
         <img src="http://data.mxnet.io/theme/mxnet.png">
       </a>
     </div>
     <div id="navbar" class="navbar-collapse collapse">
       <ul id="navbar" class="navbar navbar-left">

         <li> <a href="../get_started/index.html">Get Started</a> </li>

         <li> <a href="../tutorials/index.html">Tutorials</a> </li>

         <li> <a href="../how_to/index.html">How To</a> </li>


         <li class="dropdown">
           <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">Packages <span class="caret"></span></a>
           <ul class="dropdown-menu">

             <li><a href="../packages/python/index.html">
                 Python
             </a></li>

             <li><a href="../packages/r/index.html">
                 R
             </a></li>

             <li><a href="../packages/julia/index.html">
                 Julia
             </a></li>

             <li><a href="../packages/c++/index.html">
                 C++
             </a></li>

             <li><a href="../packages/scala/index.html">
                 Scala
             </a></li>

             <li><a href="../packages/perl/index.html">
                 Perl
             </a></li>

           </ul>
         </li>

         <li> <a href="../system/index.html">System</a> </li>
         <li>
 <form class="" role="search" action="../search.html" method="get" autocomplete="off">
   <div class="form-group inner-addon left-addon">
     <i class="glyphicon glyphicon-search"></i>
     <input type="text" name="q" class="form-control" placeholder="Search">
   </div>
   <input type="hidden" name="check_keywords" value="yes" />
   <input type="hidden" name="area" value="default" />

 </form> </li>
       </ul>
       <ul id="navbar" class="navbar navbar-right">
         <li> <a href="../index.html"><span class="flag-icon flag-icon-us"></span></a> </li>
         <li> <a href="..//zh/index.html"><span class="flag-icon flag-icon-cn"></span></a> </li>
       </ul>
     </div>
   </div>
 </div>
 Previous Navbar Layout End -->
 <div class="navbar navbar-fixed-top">
 <div class="container" id="navContainer">
 <div class="innder" id="header-inner">
 <h1 id="logo-wrap">
 <a href="../" id="logo"><img src="http://data.mxnet.io/theme/mxnet.png"/></a>
 </h1>
 <nav class="nav-bar" id="main-nav">
 <a class="main-nav-link" href="../get_started/install.html">Install</a>
 <a class="main-nav-link" href="../tutorials/index.html">Tutorials</a>
 <a class="main-nav-link" href="../how_to/index.html">How To</a>
 <span id="dropdown-menu-position-anchor">
 <a aria-expanded="true" aria-haspopup="true" class="main-nav-link dropdown-toggle" data-toggle="dropdown" href="#" role="button">API <span class="caret"></span></a>
 <ul class="dropdown-menu" id="package-dropdown-menu">
 <li><a class="main-nav-link" href="../api/python/index.html">Python</a></li>
 <li><a class="main-nav-link" href="../api/scala/index.html">Scala</a></li>
 <li><a class="main-nav-link" href="../api/r/index.html">R</a></li>
 <li><a class="main-nav-link" href="../api/julia/index.html">Julia</a></li>
 <li><a class="main-nav-link" href="../api/c++/index.html">C++</a></li>
 <li><a class="main-nav-link" href="../api/perl/index.html">Perl</a></li>
 </ul>
 </span>
 <a class="main-nav-link" href="../architecture/index.html">Architecture</a>
 <!-- <a class="main-nav-link" href="../community/index.html">Community</a> -->
 <a class="main-nav-link" href="https://github.com/dmlc/mxnet">Github</a>
 <span id="dropdown-menu-position-anchor-version" style="position: relative"><a href="#" class="main-nav-link dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">Versions(master)<span class="caret"></span></a><ul id="package-dropdown-menu" class="dropdown-menu"><li><a class="main-nav-link" href=http://mxnet.incubator.apache.org/test/>v0.10.14</a></li><li><a class="main-nav-link" href=http://mxnet.incubator.apache.org/test/versions/0.10/index.html>0.10</a></li><li><a class="main-nav-link" href=http://mxnet.incubator.apache.org/test/versions/master/index.html>master</a></li></ul></span></nav>
 <script> function getRootPath(){ return "../" } </script>
 <div class="burgerIcon dropdown">
 <a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button">☰</a>
 <ul class="dropdown-menu dropdown-menu-right" id="burgerMenu">
 <li><a href="../get_started/install.html">Install</a></li>
 <li><a href="../tutorials/index.html">Tutorials</a></li>
 <li><a href="../how_to/index.html">How To</a></li>
 <li class="dropdown-submenu">
 <a href="#" tabindex="-1">API</a>
 <ul class="dropdown-menu">
 <li><a href="../api/python/index.html" tabindex="-1">Python</a>
 </li>
 <li><a href="../api/scala/index.html" tabindex="-1">Scala</a>
 </li>
 <li><a href="../api/r/index.html" tabindex="-1">R</a>
 </li>
 <li><a href="../api/julia/index.html" tabindex="-1">Julia</a>
 </li>
 <li><a href="../api/c++/index.html" tabindex="-1">C++</a>
 </li>
 <li><a href="../api/perl/index.html" tabindex="-1">Perl</a>
 </li>
 </ul>
 </li>
 <li><a href="../architecture/index.html">Architecture</a></li>
 <li><a class="main-nav-link" href="https://github.com/dmlc/mxnet">Github</a></li>
 <li id="dropdown-menu-position-anchor-version-mobile" class="dropdown-submenu" style="position: relative"><a href="#" tabindex="-1">Versions(master)</a><ul class="dropdown-menu"><li><a tabindex="-1" href=http://mxnet.incubator.apache.org/test/>v0.10.14</a></li><li><a tabindex="-1" href=http://mxnet.incubator.apache.org/test/versions/0.10/index.html>0.10</a></li><li><a tabindex="-1" href=http://mxnet.incubator.apache.org/test/versions/master/index.html>master</a></li></ul></li></ul>
 </div>
 <div class="plusIcon dropdown">
 <a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button"><span aria-hidden="true" class="glyphicon glyphicon-plus"></span></a>
 <ul class="dropdown-menu dropdown-menu-right" id="plusMenu"></ul>
 </div>
 <div id="search-input-wrap">
 <form action="../search.html" autocomplete="off" class="" method="get" role="search">
 <div class="form-group inner-addon left-addon">
 <i class="glyphicon glyphicon-search"></i>
 <input class="form-control" name="q" placeholder="Search" type="text"/>
 </div>
 <input name="check_keywords" type="hidden" value="yes">
 <input name="area" type="hidden" value="default"/>
 </input></form>
 <div id="search-preview"></div>
 </div>
 <div id="searchIcon">
 <span aria-hidden="true" class="glyphicon glyphicon-search"></span>
 </div>
 <!-- <div id="lang-select-wrap"> -->
 <!--   <label id="lang-select-label"> -->
 <!--     <\!-- <i class="fa fa-globe"></i> -\-> -->
 <!--     <span></span> -->
 <!--   </label> -->
 <!--   <select id="lang-select"> -->
 <!--     <option value="en">Eng</option> -->
 <!--     <option value="zh">中文</option> -->
 <!--   </select> -->
 <!-- </div> -->
 <!--     <a id="mobile-nav-toggle">
         <span class="mobile-nav-toggle-bar"></span>
         <span class="mobile-nav-toggle-bar"></span>
         <span class="mobile-nav-toggle-bar"></span>
       </a> -->
 </div>
 </div>
 </div>
 <div class="container">
 <div class="row">
 <div aria-label="main navigation" class="sphinxsidebar leftsidebar" role="navigation">
 <div class="sphinxsidebarwrapper">
 <ul>
 <li class="toctree-l1"><a class="reference internal" href="../api/python/index.html">Python Documents</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../api/r/index.html">R Documents</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../api/julia/index.html">Julia Documents</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../api/c++/index.html">C++ Documents</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../api/scala/index.html">Scala Documents</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../api/perl/index.html">Perl Documents</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../how_to/index.html">HowTo Documents</a></li>
 <li class="toctree-l1"><a class="reference internal" href="index.html">System Documents</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../tutorials/index.html">Tutorials</a></li>
 </ul>
 </div>
 </div>
 <div class="content">
 <div class="section" id="dependency-engine-for-deep-learning">
 <span id="dependency-engine-for-deep-learning"></span><h1>Dependency Engine for Deep Learning<a class="headerlink" href="#dependency-engine-for-deep-learning" title="Permalink to this headline">¶</a></h1>
 <p>We always want deep learning libraries
 to run faster and scale to larger datasets.
 One natural approach is to see if we can benefit
 from throwing more hardware at the problem,
 as by using multiple GPUs simultaneously.</p>
 <p>Library designers then ask:
 How can we <em>parallelize</em> computation across devices?
 And, more importantly, how can we <em>synchronize</em> computation
 when we introduce multi-threading?
 A runtime dependency engine is a generic solution to these problems.</p>
 <p>In this document, we examine approaches for using
 runtime dependency scheduling to accelerate deep learning.
 We aim to explain how runtime dependency scheduling
 can both speed up and simplify multi-device deep learning.
 We also explore potential designs for a generic dependency engine
 that could be both library- and operation-independent.</p>
 <p>Most of the discussion of on this page draws inspiration
 from the MXNet dependency engine.
 The dependency tracking algorithm we discuss
 was primarily developed by <a class="reference external" href="https://github.com/hotpxl">Yutian Li</a>
 and <a class="reference external" href="https://github.com/jermainewang">Mingjie Wang</a>.</p>
 <div class="section" id="dependency-scheduling">
 <span id="dependency-scheduling"></span><h2>Dependency Scheduling<a class="headerlink" href="#dependency-scheduling" title="Permalink to this headline">¶</a></h2>
 <p>Although most users want to take advantage of parallel computation,
 most of us are more familiar with serial programs.
 So one natural question is: how can we write serial programs
 and build a library to automatically parallelize our programs
 in an asynchronous way?</p>
 <p>For example, in the following code, we can run <code class="docutils literal"><span class="pre">B</span> <span class="pre">=</span> <span class="pre">A</span> <span class="pre">+</span> <span class="pre">1</span></code>
 and <code class="docutils literal"><span class="pre">C</span> <span class="pre">=</span> <span class="pre">A</span> <span class="pre">+</span> <span class="pre">2</span></code> in any order, or in parallel:</p>
 <div class="highlight-python"><div class="highlight"><pre><span></span>    <span class="n">A</span> <span class="o">=</span> <span class="mi">2</span>
     <span class="n">B</span> <span class="o">=</span> <span class="n">A</span> <span class="o">+</span> <span class="mi">1</span>
     <span class="n">C</span> <span class="o">=</span> <span class="n">A</span> <span class="o">+</span> <span class="mi">2</span>
     <span class="n">D</span> <span class="o">=</span> <span class="n">B</span> <span class="o">*</span> <span class="n">C</span>
 </pre></div>
 </div>
 <p>However, it’s quite hard to code the sequence manually
 because the last operation, <code class="docutils literal"><span class="pre">D</span> <span class="pre">=</span> <span class="pre">B</span> <span class="pre">*</span> <span class="pre">C</span></code>, needs to wait
 for both of the preceding operations to complete before it starts.
 The following dependency graph/data flow graph illustrates this.</p>
 <p><img alt="Dep Simple" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_simple.png"/></p>
 <p>A dependency engine is a library that takes a sequence of operations
 and schedules them according to the dependency pattern,  potentially in parallel.
 So in this example, a dependency library
 could run <code class="docutils literal"><span class="pre">B</span> <span class="pre">=</span> <span class="pre">A</span> <span class="pre">+</span> <span class="pre">1</span></code> and <code class="docutils literal"><span class="pre">C</span> <span class="pre">=</span> <span class="pre">A</span> <span class="pre">+</span> <span class="pre">2</span></code> in parallel,
 and run <code class="docutils literal"><span class="pre">D</span> <span class="pre">=</span> <span class="pre">B</span> <span class="pre">*</span> <span class="pre">C</span></code> after those operations complete.</p>
 </div>
 <div class="section" id="problems-in-dependency-scheduling">
 <span id="problems-in-dependency-scheduling"></span><h2>Problems in Dependency Scheduling<a class="headerlink" href="#problems-in-dependency-scheduling" title="Permalink to this headline">¶</a></h2>
 <p>A dependency engine relieves the burden of writing concurrent programs.
 However, as operations become parallelized,
 new dependency tracking problems arise.
 In this section, we discuss those problems.</p>
 <div class="section" id="data-flow-dependency">
 <span id="data-flow-dependency"></span><h3>Data Flow Dependency<a class="headerlink" href="#data-flow-dependency" title="Permalink to this headline">¶</a></h3>
 <p>Data flow dependency describes how the outcome of one computation
 can be used in other computations.
 Every dependency engine has to solve the data flow dependency problem.</p>
 <p><img alt="Dep Simple" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_simple.png"/></p>
 <p>Because we discussed this issue in the preceding section,
 we include the same figure here. Libraries that have
 data flow tracking engines include Minerva and Purine2.</p>
 </div>
 <div class="section" id="memory-recycling">
 <span id="memory-recycling"></span><h3>Memory Recycling<a class="headerlink" href="#memory-recycling" title="Permalink to this headline">¶</a></h3>
 <p>When should we recycle the memory that we allocated to the arrays?
 In serial processing, this is easy to determine.
 We simply recycle the memory after the variable goes out of scope.
 However, as the following figure shows, this is a bit harder in parallel processing.</p>
 <p><img alt="Dep Del" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_del.png"/></p>
 <p>In this example, because both computations need to use values from A,
 we can’t recycle the memory until both complete.
 The engine must schedule the memory recycling operations according to the dependencies,
 and ensure that they are executed after both <code class="docutils literal"><span class="pre">B</span> <span class="pre">=</span> <span class="pre">A</span> <span class="pre">+</span> <span class="pre">1</span></code> and <code class="docutils literal"><span class="pre">C</span> <span class="pre">=</span> <span class="pre">A</span> <span class="pre">+</span> <span class="pre">2</span></code> complete.</p>
 </div>
 <div class="section" id="random-number-generation">
 <span id="random-number-generation"></span><h3>Random Number Generation<a class="headerlink" href="#random-number-generation" title="Permalink to this headline">¶</a></h3>
 <p>Random number generators, which are commonly used in machine learning,
 pose interesting challenges for dependency engines.
 Consider the following example:</p>
 <p><img alt="Dep Rand" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_rand.png"/></p>
 <p>In this example, we are generating random numbers in a sequence.
 Although it seems that the two random number generations can be parallelized,
 this is usually not the case. A pseudo-random number generator (PRNG)
 is not thread-safe because it might cause some internal state
 to mutate when generating a new number.
 Even if the PRNG is thread-safe,
 it is preferable to serialize number generation,
 so we can get reproducible random numbers.</p>
 </div>
 </div>
 <div class="section" id="case-study-a-dependency-engine-for-a-multi-gpu-neural-network">
 <span id="case-study-a-dependency-engine-for-a-multi-gpu-neural-network"></span><h2>Case Study: A Dependency Engine for a Multi-GPU Neural Network<a class="headerlink" href="#case-study-a-dependency-engine-for-a-multi-gpu-neural-network" title="Permalink to this headline">¶</a></h2>
 <p>In the last section, we discussed the problems
 we might face in designing a dependency engine.
 Before thinking about how to design a generic engine to solve those problems,
 let’s consider how a dependency engine can help in multi-GPU training of a neural network.
 The following pseudocode Python program illustrates
 training one batch on a  two-layer neural network.</p>
 <div class="highlight-python"><div class="highlight"><pre><span></span>    <span class="c1"># Example of one iteration Two GPU neural Net</span>
     <span class="n">data</span> <span class="o">=</span> <span class="n">next_batch</span><span class="p">()</span>
     <span class="n">data</span><span class="p">[</span><span class="n">gpu0</span><span class="p">]</span><span class="o">.</span><span class="n">copyfrom</span><span class="p">(</span><span class="n">data</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="mi">50</span><span class="p">])</span>
     <span class="n">data</span><span class="p">[</span><span class="n">gpu1</span><span class="p">]</span><span class="o">.</span><span class="n">copyfrom</span><span class="p">(</span><span class="n">data</span><span class="p">[</span><span class="mi">50</span><span class="p">:</span><span class="mi">100</span><span class="p">])</span>
     <span class="c1"># forward, backprop on GPU 0</span>
     <span class="n">fc1</span><span class="p">[</span><span class="n">gpu0</span><span class="p">]</span> <span class="o">=</span> <span class="n">FullcForward</span><span class="p">(</span><span class="n">data</span><span class="p">[</span><span class="n">gpu0</span><span class="p">],</span> <span class="n">fc1_weight</span><span class="p">[</span><span class="n">gpu0</span><span class="p">])</span>
     <span class="n">fc2</span><span class="p">[</span><span class="n">gpu0</span><span class="p">]</span> <span class="o">=</span> <span class="n">FullcForward</span><span class="p">(</span><span class="n">fc1</span><span class="p">[</span><span class="n">gpu0</span><span class="p">],</span> <span class="n">fc2_weight</span><span class="p">[</span><span class="n">gpu0</span><span class="p">])</span>
     <span class="n">fc2_ograd</span><span class="p">[</span><span class="n">gpu0</span><span class="p">]</span> <span class="o">=</span> <span class="n">LossGrad</span><span class="p">(</span><span class="n">fc2</span><span class="p">[</span><span class="n">gpu0</span><span class="p">],</span> <span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">:</span><span class="mi">50</span><span class="p">])</span>
     <span class="n">fc1_ograd</span><span class="p">[</span><span class="n">gpu0</span><span class="p">],</span> <span class="n">fc2_wgrad</span><span class="p">[</span><span class="n">gpu0</span><span class="p">]</span> <span class="o">=</span>
       <span class="n">FullcBackward</span><span class="p">(</span><span class="n">fc2_ograd</span><span class="p">[</span><span class="n">gpu0</span><span class="p">]</span> <span class="p">,</span> <span class="n">fc2_weight</span><span class="p">[</span><span class="n">gpu0</span><span class="p">])</span>
       <span class="n">_</span><span class="p">,</span> <span class="n">fc1_wgrad</span><span class="p">[</span><span class="n">gpu0</span><span class="p">]</span> <span class="o">=</span> <span class="n">FullcBackward</span><span class="p">(</span><span class="n">fc1_ograd</span><span class="p">[</span><span class="n">gpu0</span><span class="p">]</span> <span class="p">,</span> <span class="n">fc1_weight</span><span class="p">[</span><span class="n">gpu0</span><span class="p">])</span>
     <span class="c1"># forward, backprop on GPU 1</span>
     <span class="n">fc1</span><span class="p">[</span><span class="n">gpu1</span><span class="p">]</span> <span class="o">=</span> <span class="n">FullcForward</span><span class="p">(</span><span class="n">data</span><span class="p">[</span><span class="n">gpu1</span><span class="p">],</span> <span class="n">fc1_weight</span><span class="p">[</span><span class="n">gpu1</span><span class="p">])</span>
     <span class="n">fc2</span><span class="p">[</span><span class="n">gpu1</span><span class="p">]</span> <span class="o">=</span> <span class="n">FullcForward</span><span class="p">(</span><span class="n">fc1</span><span class="p">[</span><span class="n">gpu1</span><span class="p">],</span> <span class="n">fc2_weight</span><span class="p">[</span><span class="n">gpu1</span><span class="p">])</span>
     <span class="n">fc2_ograd</span><span class="p">[</span><span class="n">gpu1</span><span class="p">]</span> <span class="o">=</span> <span class="n">LossGrad</span><span class="p">(</span><span class="n">fc2</span><span class="p">[</span><span class="n">gpu1</span><span class="p">],</span> <span class="n">label</span><span class="p">[</span><span class="mi">50</span><span class="p">:</span><span class="mi">100</span><span class="p">])</span>
     <span class="n">fc1_ograd</span><span class="p">[</span><span class="n">gpu1</span><span class="p">],</span> <span class="n">fc2_wgrad</span><span class="p">[</span><span class="n">gpu1</span><span class="p">]</span> <span class="o">=</span>
          <span class="n">FullcBackward</span><span class="p">(</span><span class="n">fc2_ograd</span><span class="p">[</span><span class="n">gpu1</span><span class="p">]</span> <span class="p">,</span> <span class="n">fc2_weight</span><span class="p">[</span><span class="n">gpu1</span><span class="p">])</span>
          <span class="n">_</span><span class="p">,</span> <span class="n">fc1_wgrad</span><span class="p">[</span><span class="n">gpu1</span><span class="p">]</span> <span class="o">=</span> <span class="n">FullcBackward</span><span class="p">(</span><span class="n">fc1_ograd</span><span class="p">[</span><span class="n">gpu1</span><span class="p">]</span> <span class="p">,</span> <span class="n">fc1_weight</span><span class="p">[</span><span class="n">gpu1</span><span class="p">])</span>
     <span class="c1"># aggregate gradient and update</span>
     <span class="n">fc1_wgrad</span><span class="p">[</span><span class="n">cpu</span><span class="p">]</span>  <span class="o">=</span> <span class="n">fc1_wgrad</span><span class="p">[</span><span class="n">gpu0</span><span class="p">]</span> <span class="o">+</span> <span class="n">fc1_wgrad</span><span class="p">[</span><span class="n">gpu1</span><span class="p">]</span>
     <span class="n">fc2_wgrad</span><span class="p">[</span><span class="n">cpu</span><span class="p">]</span>  <span class="o">=</span> <span class="n">fc2_wgrad</span><span class="p">[</span><span class="n">gpu0</span><span class="p">]</span> <span class="o">+</span> <span class="n">fc2_wgrad</span><span class="p">[</span><span class="n">gpu1</span><span class="p">]</span>
     <span class="n">fc1_weight</span><span class="p">[</span><span class="n">cpu</span><span class="p">]</span> <span class="o">-=</span> <span class="n">lr</span> <span class="o">*</span>  <span class="n">fc1_wgrad</span><span class="p">[</span><span class="n">cpu</span><span class="p">]</span>
     <span class="n">fc2_weight</span><span class="p">[</span><span class="n">cpu</span><span class="p">]</span> <span class="o">-=</span> <span class="n">lr</span> <span class="o">*</span>  <span class="n">fc2_wgrad</span><span class="p">[</span><span class="n">cpu</span><span class="p">]</span>
     <span class="n">fc1_weight</span><span class="p">[</span><span class="n">cpu</span><span class="p">]</span><span class="o">.</span><span class="n">copyto</span><span class="p">(</span><span class="n">fc1_weight</span><span class="p">[</span><span class="n">gpu0</span><span class="p">]</span> <span class="p">,</span> <span class="n">fc1_weight</span><span class="p">[</span><span class="n">gpu1</span><span class="p">])</span>
     <span class="n">fc2_weight</span><span class="p">[</span><span class="n">cpu</span><span class="p">]</span><span class="o">.</span><span class="n">copyto</span><span class="p">(</span><span class="n">fc2_weight</span><span class="p">[</span><span class="n">gpu0</span><span class="p">]</span> <span class="p">,</span> <span class="n">fc2_weight</span><span class="p">[</span><span class="n">gpu1</span><span class="p">])</span>
 </pre></div>
 </div>
 <p>In this program, the data 0 to 50  is copied to GPU 0,
 and the data 50 to 100 is copied to GPU 1.
 The calculated gradients are aggregated in the CPU,
 which then performs a simple SGD update,
 and copies the updated weight back to each GPU.
 This is a common way to write a parallel program in a serial manner.
 The following dependency graph shows how it can be parallelized:</p>
 <p><img alt="Dep Net" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_net.png"/></p>
 <p><strong><em>Notes:</em></strong></p>
 <ul class="simple">
 <li>The gradient can be copied to the CPU as soon as we get the gradient of a layer.</li>
 <li>The weight can be copied back soon as the weight is updated.</li>
 <li>In the forward pass, we have a dependency on <code class="docutils literal"><span class="pre">fc1_weight[cpu].copyto(fc1_weight[gpu0]</span> <span class="pre">,</span> <span class="pre">fc1_weight[gpu1])</span></code>
 from the previous iteration.</li>
 <li>There is a delay in computation between the last backward pass to layer k and the next forward call to layer k. We can synchronize the weight of layer k <em>in parallel</em> with other computation during this delay.</li>
 </ul>
 <p>This approach to optimization is used by multi-GPU deep learning libraries, such as CXXNet.
 The point is to overlap weight synchronization (communication) with computation.
 However, it’s not easy to do that, because the copy operation needs to be triggered
 as soon as the backward pass of the layer completes,
 which then triggers the reduction, updates, etc.</p>
 <p>A dependency engine can schedule these operations and perform multi-threading
 and dependency tracking.</p>
 </div>
 <div class="section" id="designing-a-generic-dependency-engine">
 <span id="designing-a-generic-dependency-engine"></span><h2>Designing a Generic Dependency Engine<a class="headerlink" href="#designing-a-generic-dependency-engine" title="Permalink to this headline">¶</a></h2>
 <p>We hope that you’re convinced that a dependency engine is useful
 for scaling deep learning programs to multiple devices.
 Now let’s discuss how we can design and implement
 a generic interface for a dependency engine.
 This solution isn’t the only possible design for a dependency engine.
 It’s an example that we think is useful in most cases.</p>
 <p>Our goal is to create a dependency engine that is <em>generic</em> and <em>lightweight</em>.
 Ideally, we’d like the engine that easily plugs into existing deep learning code,
 and that can scale up to multiple machines with minor modifications.
 To do that, we need to focus only on dependency tracking,
 not on assumptions about what users can or can’t do.</p>
 <p>Here’s a summary of goals for the engine:</p>
 <ul class="simple">
 <li>The engine should not be aware of what operations it performs, so that users can perform any operations they define.</li>
 <li>It should not be restricted in what type of objects it can schedule.<ul>
 <li>We should be able to schedule dependencies on GPU and CPU memory.</li>
 <li>We should be able to track dependencies on the random number generator, etc.</li>
 </ul>
 </li>
 <li>The engine should not allocate resources. It should only track dependencies. Users can allocate their own memory, PRNG, etc.</li>
 </ul>
 <p>The following Python snippet provides an engine interface that might help us reach our goal. Note that a real implementation will be closer to the metal, typically in C++.</p>
 <div class="highlight-python"><div class="highlight"><pre><span></span>    <span class="k">class</span> <span class="nc">DepEngine</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span>
         <span class="k">def</span> <span class="nf">new_variable</span><span class="p">():</span>
             <span class="sd">"""Return a new variable tag</span>
 <span class="sd">            Returns</span>
 <span class="sd">            -------</span>
 <span class="sd">            vtag : Variable Tag</span>
 <span class="sd">                The token of the engine to represent dependencies.</span>
 <span class="sd">            """</span>
             <span class="k">pass</span>

         <span class="k">def</span> <span class="nf">push</span><span class="p">(</span><span class="n">exec_func</span><span class="p">,</span> <span class="n">read_vars</span><span class="p">,</span> <span class="n">mutate_vars</span><span class="p">):</span>
             <span class="sd">"""Push the operation to the engine.</span>

 <span class="sd">            Parameters</span>
 <span class="sd">            ----------</span>
 <span class="sd">            exec_func : callable</span>
 <span class="sd">                The real operation to be performed.</span>

 <span class="sd">            read_vars : list of Variable Tags</span>
 <span class="sd">                The list of variables this operation will read from.</span>

 <span class="sd">            mutate_vars : list of Variable Tags</span>
 <span class="sd">                The list of variables this operation will mutate.</span>
 <span class="sd">            """</span>
             <span class="k">pass</span>
 </pre></div>
 </div>
 <p>Because we can’t make assumptions about what objects we are scheduling, we ask the user to allocate a
 <em>virtual tag</em> that is associated with each object to represent what we need to schedule.
 So, at the beginning, the user can allocate the variable tag,
 and attach it to each of the objects that we want to schedule.</p>
 <p><img alt="Dep Net" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/tag_var.png"/></p>
 <p>The user then calls <code class="docutils literal"><span class="pre">push</span></code> to tell the engine about the function to execute.
 The user also needs to specify the dependencies of the operation,
 using <code class="docutils literal"><span class="pre">read_vars</span></code> and <code class="docutils literal"><span class="pre">write_vars</span></code>:</p>
 <ul class="simple">
 <li><code class="docutils literal"><span class="pre">read_vars</span></code> are variable tags for objects that the operation will <em>read from</em>, without changing their internal state.</li>
 <li><code class="docutils literal"><span class="pre">mutate_vars</span></code> are variable tags for objects whose internal states the operation will mutate.</li>
 </ul>
 <p><img alt="Push Op" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/push_var.png"/></p>
 <p>The preceding figure shows how to push operation <code class="docutils literal"><span class="pre">B</span> <span class="pre">=</span> <span class="pre">A</span> <span class="pre">+</span> <span class="pre">1</span></code> to the dependency engine. <code class="docutils literal"><span class="pre">B.data</span></code> and
 <code class="docutils literal"><span class="pre">A.data</span></code> are the allocated space. Note that the engine is <em>only aware of variable tags</em>.
 Any execution function can be processed.
 This interface is generic for the operations and resources we want to schedule.</p>
 <p>For fun, let’s look at how the engine internals work with the tags by considering the following code snippet:</p>
 <div class="highlight-python"><div class="highlight"><pre><span></span>    <span class="n">B</span> <span class="o">=</span> <span class="n">A</span> <span class="o">+</span> <span class="mi">1</span>
     <span class="n">C</span> <span class="o">=</span> <span class="n">A</span> <span class="o">+</span> <span class="mi">2</span>
     <span class="n">A</span> <span class="o">=</span> <span class="n">C</span> <span class="o">*</span> <span class="mi">2</span>
     <span class="n">D</span> <span class="o">=</span> <span class="n">A</span> <span class="o">+</span> <span class="mi">3</span>
 </pre></div>
 </div>
 <p>The first line reads variable <code class="docutils literal"><span class="pre">A</span></code> and mutates variable <code class="docutils literal"><span class="pre">B</span></code>. The second line reads variable <code class="docutils literal"><span class="pre">A</span></code> and mutates variable <code class="docutils literal"><span class="pre">C</span></code>. And so on.</p>
 <p>The engine maintains a queue for each variable, as the following animation shows for each of the four lines. Green blocks represents a read action, while red blocks represent mutations.</p>
 <p><img alt="Dependency Queue" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_queue.gif"/></p>
 <p>Upon building this queue, the engine sees that the first two green blocks at the beginning of <code class="docutils literal"><span class="pre">A</span></code>‘s queue could actually be run in parallel because they are both read actions and won’t conflict with each other. The following graph illustrates this point.</p>
 <p><img alt="Dependency Parallelism" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_parallel.png"/></p>
 <p>One cool thing about all this scheduling is that it’s not confined to numerical calculations.
 Because everything that is scheduled is only a tag, the engine could schedule everything!</p>
 <p>The following figure gives a complete push sequence of the programs we mentioned in previous sections.</p>
 <p><img alt="Push Seq" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/push_seq.png"/></p>
 <div class="section" id="porting-existing-code-to-the-dependency-engine">
 <span id="porting-existing-code-to-the-dependency-engine"></span><h3>Porting Existing Code to the Dependency Engine<a class="headerlink" href="#porting-existing-code-to-the-dependency-engine" title="Permalink to this headline">¶</a></h3>
 <p>Because the generic interface doesn’t control things like memory allocation and which operation to execute,
 most existing code can be scheduled by the dependency engine in two steps:</p>
 <ol class="simple">
 <li>Allocate the variable tags associated with resources like memory blob, PRNGS.<ul>
 <li>Call <code class="docutils literal"><span class="pre">push</span></code> with the execution function as the original code to execute, and put the variable tags of
 corresponding resources correctly in <code class="docutils literal"><span class="pre">read_vars</span></code> and <code class="docutils literal"><span class="pre">mutate_vars</span></code>.</li>
 </ul>
 </li>
 </ol>
 </div>
 </div>
 <div class="section" id="implementing-the-generic-dependency-engine">
 <span id="implementing-the-generic-dependency-engine"></span><h2>Implementing the Generic Dependency Engine<a class="headerlink" href="#implementing-the-generic-dependency-engine" title="Permalink to this headline">¶</a></h2>
 <p>We have described the generic engine interface and
 how it can be used to schedule various operations.
 In this section, we provide a high-level discussion
 of how to implement such an engine.</p>
 <p>The general idea is as follows:</p>
 <ul class="simple">
 <li>Use a queue to track all of the pending dependencies on each variable tag.</li>
 <li>Use a counter on each operation to track how many dependencies are yet to be fulfilled.</li>
 <li>When operations are completed, update the state of the queue and dependency counters to schedule new operations.</li>
 </ul>
 <p>The following figure illustrates the scheduling algorithm
 and might give you a better sense of what is going on in the engine.</p>
 <p><img alt="Dep Tracking" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/engine_queue_step.png"/></p>
 <p>Below, we show another example involving random number generators.</p>
 <p><img alt="Dep Rand" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/engine_queue_rand.png"/></p>
 <p>As you can see, the purpose of the algorithm is to update pending queues
 of operations and to make the right state transition when an operation has completed.
 More care should be taken to make sure the state transitions
 are done in a way that’s safe for threads.</p>
 <div class="section" id="separate-dependency-tracking-with-running-policy">
 <span id="separate-dependency-tracking-with-running-policy"></span><h3>Separate Dependency Tracking with Running Policy<a class="headerlink" href="#separate-dependency-tracking-with-running-policy" title="Permalink to this headline">¶</a></h3>
 <p>If you’re reading carefully, you might have noticed
 that the preceding section shows only the algorithm
 for deciding when an operation can be executed.
 We didn’t show how to actually run an operation.
 In practice, there can be many different policies.
 For example, we can either use a global thread-pool to run all operations,
 or use a specific thread to run operations on each device.</p>
 <p>This running policy is usually independent of dependency tracking,
 and can be separated out as either an independent module
 or a virtual interface of base-dependency tracking modules.
 Developing an elegant runtime policy that is fair
 to all operations and schedules is an interesting systems problem itself.</p>
 </div>
 </div>
 <div class="section" id="discussion">
 <span id="discussion"></span><h2>Discussion<a class="headerlink" href="#discussion" title="Permalink to this headline">¶</a></h2>
 <p>The design that we discussed in this article
 isn’t the only solution to the dependency tracking problem.
 It’s just one example of how we might approach this.
 To be sure, some of these design choices are debatable.
 We’ll discuss some of them in this section.</p>
 <div class="section" id="dynamic-vs-static">
 <span id="dynamic-vs-static"></span><h3>Dynamic vs. Static<a class="headerlink" href="#dynamic-vs-static" title="Permalink to this headline">¶</a></h3>
 <p>The dependency engine interface discussed in this topic is somewhat dynamic
 in the sense that the user can push operations one by one,
 instead of declaring the entire dependency graph (static).
 Dynamic scheduling can require more overhead
 than static declarations, in terms of data structure.
 However, it also enables more flexibility, such as supporting auto parallelism
 for imperative programs or a mixture of imperative and symbolic programs.
 You can also add some level of predeclared operations
 to the interface to enable data structure reuse.</p>
 </div>
 <div class="section" id="mutation-vs-immutable">
 <span id="mutation-vs-immutable"></span><h3>Mutation vs. Immutable<a class="headerlink" href="#mutation-vs-immutable" title="Permalink to this headline">¶</a></h3>
 <p>The generic engine interface presented in this page
 supports explicit scheduling for mutation.
 In a typical data flow engine, the data are usually immutable.
 Working with immutable data has a lot of benefits.
 For example, immutable data is generally more suitable for parallelization,
 and facilitates better fault tolerance in a distributed setting (by way of re-computation).</p>
 <p>However, immutability presents several challenges:</p>
 <ul class="simple">
 <li>It’s harder to schedule resource contention problems, as arise when dealing with random numbers and deletion.</li>
 <li>The engine usually needs to manage resources (memory, random number) to avoid conflicts. It’s harder to plug in user-allocated space, etc.</li>
 <li>Preallocated static memory isn’t available, again because the usual pattern is to write to a preallocated layer space, which is not supported if data is immutable.</li>
 </ul>
 <p>Allowing mutation mitigates these issues.</p>
 </div>
 </div>
 <div class="section" id="source-code-of-the-generic-dependency-engine">
 <span id="source-code-of-the-generic-dependency-engine"></span><h2>Source Code of the Generic Dependency Engine<a class="headerlink" href="#source-code-of-the-generic-dependency-engine" title="Permalink to this headline">¶</a></h2>
 <p><a class="reference external" href="https://github.com/dmlc/mxnet">MXNet</a> provides an implementation
 of the generic dependency engine described in this page.
 You can find more details in <a class="reference external" href="http://mxnet.io/architecture/note_engine.html">this topic</a>.
 We welcome your contributions.</p>
 </div>
 <div class="section" id="next-steps">
 <span id="next-steps"></span><h2>Next Steps<a class="headerlink" href="#next-steps" title="Permalink to this headline">¶</a></h2>
 <div class="toctree-wrapper compound">
 <ul>
 <li class="toctree-l1"><a class="reference external" href="http://mxnet.io/architecture/note_memory.html">Squeeze the Memory Consumption of Deep Learning</a></li>
 <li class="toctree-l1"><a class="reference external" href="http://mxnet.io/architecture/note_data_loading.html">Efficient Data Loading Module for Deep Learning</a></li>
 <li class="toctree-l1"><a class="reference external" href="http://mxnet.io/architecture/rnn_interface.html">Survey of RNN Interface</a></li>
 </ul>
 </div>
 </div>
 </div>
 <div class="container">
 <div class="footer">
 <p> © 2015-2017 DMLC. All rights reserved. </p>
 </div>
 </div>
 </div>
 <div aria-label="main navigation" class="sphinxsidebar rightsidebar" role="navigation">
 <div class="sphinxsidebarwrapper">
 <h3><a href="../index.html">Table Of Contents</a></h3>
 <ul>
 <li><a class="reference internal" href="#">Dependency Engine for Deep Learning</a><ul>
 <li><a class="reference internal" href="#dependency-scheduling">Dependency Scheduling</a></li>
 <li><a class="reference internal" href="#problems-in-dependency-scheduling">Problems in Dependency Scheduling</a><ul>
 <li><a class="reference internal" href="#data-flow-dependency">Data Flow Dependency</a></li>
 <li><a class="reference internal" href="#memory-recycling">Memory Recycling</a></li>
 <li><a class="reference internal" href="#random-number-generation">Random Number Generation</a></li>
 </ul>
 </li>
 <li><a class="reference internal" href="#case-study-a-dependency-engine-for-a-multi-gpu-neural-network">Case Study: A Dependency Engine for a Multi-GPU Neural Network</a></li>
 <li><a class="reference internal" href="#designing-a-generic-dependency-engine">Designing a Generic Dependency Engine</a><ul>
 <li><a class="reference internal" href="#porting-existing-code-to-the-dependency-engine">Porting Existing Code to the Dependency Engine</a></li>
 </ul>
 </li>
 <li><a class="reference internal" href="#implementing-the-generic-dependency-engine">Implementing the Generic Dependency Engine</a><ul>
 <li><a class="reference internal" href="#separate-dependency-tracking-with-running-policy">Separate Dependency Tracking with Running Policy</a></li>
 </ul>
 </li>
 <li><a class="reference internal" href="#discussion">Discussion</a><ul>
 <li><a class="reference internal" href="#dynamic-vs-static">Dynamic vs. Static</a></li>
 <li><a class="reference internal" href="#mutation-vs-immutable">Mutation vs. Immutable</a></li>
 </ul>
 </li>
 <li><a class="reference internal" href="#source-code-of-the-generic-dependency-engine">Source Code of the Generic Dependency Engine</a></li>
 <li><a class="reference internal" href="#next-steps">Next Steps</a></li>
 </ul>
 </li>
 </ul>
 </div>
 </div>
 </div> <!-- pagename != index -->
 <script crossorigin="anonymous" integrity="sha384-0mSbJDEHialfmuBBQP6A4Qrprq5OVfW37PRR3j5ELqxss1yVqOtnepnHVP9aJ7xS" src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js"></script>
 <script src="../_static/js/sidebar.js" type="text/javascript"></script>
 <script src="../_static/js/search.js" type="text/javascript"></script>
 <script src="../_static/js/navbar.js" type="text/javascript"></script>
 <script src="../_static/js/clipboard.min.js" type="text/javascript"></script>
 <script src="../_static/js/copycode.js" type="text/javascript"></script>
 <script type="text/javascript">
         $('body').ready(function () {
             $('body').css('visibility', 'visible');
         });
     </script>
 </div></body>
 </html>