<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Some Tips for Improving MXNet Performance — mxnet  documentation</title>
<link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css" integrity="sha384-1q8mTJOASx8j1Au+a5WDVnPi2lkFfwwEAa8hDDdjZlpLegxhjVME1fgjWPGmkzs7" rel="stylesheet"/>
<link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css" rel="stylesheet"/>
<link href="../_static/basic.css" rel="stylesheet" type="text/css">
<link href="../_static/pygments.css" rel="stylesheet" type="text/css">
<link href="../_static/mxnet.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript">
      // Page-level options object declared as a global before the helper
      // scripts below are loaded. Presumably read by ../_static/doctools.js and
      // ../_static/searchtools_custom.js — confirm against those files.
      var DOCUMENTATION_OPTIONS = {
        // Relative prefix; same '../' used for the _static assets on this page.
        URL_ROOT:    '../',
        // Empty string on this page (no version value was filled in).
        VERSION:     '',
        COLLAPSE_INDEX: false,
        FILE_SUFFIX: '.html',
        HAS_SOURCE:  true,
        SOURCELINK_SUFFIX: ''
      };
    </script>
<script src="../_static/jquery-1.11.1.js" type="text/javascript"></script>
<script src="../_static/underscore.js" type="text/javascript"></script>
<script src="../_static/searchtools_custom.js" type="text/javascript"></script>
<script src="../_static/doctools.js" type="text/javascript"></script>
<script src="../_static/selectlang.js" type="text/javascript"></script>
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript"></script>
<script type="text/javascript">
/* Registers a jQuery ready callback that calls
   Search.loadIndex("/searchindex.js") and then Search.init().
   `Search` is not defined in this file; presumably it comes from
   ../_static/searchtools_custom.js — confirm.
   NOTE(review): the index path is site-absolute, while the other local assets
   on this page use the relative "../" prefix; verify it still resolves when
   the docs are served from a sub-path. */
 jQuery(function() { Search.loadIndex("/searchindex.js"); Search.init();}); </script>
<script>
      // Google Analytics bootstrap. The IIFE stores the name 'ga' in
      // window.GoogleAnalyticsObject, installs window.ga (unless it already
      // exists) as a stub that pushes its arguments onto ga.q, stamps ga.l with
      // the current time, then creates an async <script> pointing at
      // analytics.js and inserts it before the first <script> in the document.
      (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
      (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new
      Date();a=s.createElement(o),
      m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
      })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');

      // Queue a 'create' command for property 'UA-96378503-1', then a
      // 'send' / 'pageview' command for this page load.
      ga('create', 'UA-96378503-1', 'auto');
      ga('send', 'pageview');

    </script>
<!-- -->
<!-- <script type="text/javascript" src="../_static/jquery.js"></script> -->
<!-- -->
<!-- <script type="text/javascript" src="../_static/underscore.js"></script> -->
<!-- -->
<!-- <script type="text/javascript" src="../_static/doctools.js"></script> -->
<!-- -->
<!-- <script type="text/javascript" src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script> -->
<!-- -->
<link href="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/mxnet-icon.png" rel="icon" type="image/png"/>
</head>
<body role="document"><!-- Previous Navbar Layout
<div class="navbar navbar-default navbar-fixed-top">
  <div class="container">
    <div class="navbar-header">
      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false" aria-controls="navbar">
        <span class="sr-only">Toggle navigation</span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
      </button>
      <a href="../" class="navbar-brand">
        <img src="http://data.mxnet.io/theme/mxnet.png">
      </a>
    </div>
    <div id="navbar" class="navbar-collapse collapse">
      <ul id="navbar" class="navbar navbar-left">
        
        <li> <a href="../get_started/index.html">Get Started</a> </li>
        
        <li> <a href="../tutorials/index.html">Tutorials</a> </li>
        
        <li> <a href="../how_to/index.html">How To</a> </li>
        
        
        <li class="dropdown">
          <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">Packages <span class="caret"></span></a>
          <ul class="dropdown-menu">
            
            <li><a href="../packages/python/index.html">
                Python
            </a></li>
            
            <li><a href="../packages/r/index.html">
                R
            </a></li>
            
            <li><a href="../packages/julia/index.html">
                Julia
            </a></li>
            
            <li><a href="../packages/c++/index.html">
                C++
            </a></li>
            
            <li><a href="../packages/scala/index.html">
                Scala
            </a></li>
            
            <li><a href="../packages/perl/index.html">
                Perl
            </a></li>
            
          </ul>
        </li>
        
        <li> <a href="../system/index.html">System</a> </li>
        <li> 
<form class="" role="search" action="../search.html" method="get" autocomplete="off">
  <div class="form-group inner-addon left-addon">
    <i class="glyphicon glyphicon-search"></i>
    <input type="text" name="q" class="form-control" placeholder="Search">
  </div>
  <input type="hidden" name="check_keywords" value="yes" />
  <input type="hidden" name="area" value="default" />
  
</form> </li>
      </ul>
      <ul id="navbar" class="navbar navbar-right">
        <li> <a href="../index.html"><span class="flag-icon flag-icon-us"></span></a> </li>
        <li> <a href="..//zh/index.html"><span class="flag-icon flag-icon-cn"></span></a> </li>
      </ul>
    </div>
  </div>
</div>
Previous Navbar Layout End -->
<div class="navbar navbar-fixed-top">
<div class="container" id="navContainer">
<div class="innder" id="header-inner">
<h1 id="logo-wrap">
<a href="../" id="logo"><img alt="MXNet home" src="http://data.mxnet.io/theme/mxnet.png"/></a>
</h1>
<nav class="nav-bar" id="main-nav">
<a class="main-nav-link" href="../get_started/install.html">Install</a>
<a class="main-nav-link" href="../tutorials/index.html">Tutorials</a>
<a class="main-nav-link" href="../how_to/index.html">How To</a>
<span id="dropdown-menu-position-anchor">
<a aria-expanded="true" aria-haspopup="true" class="main-nav-link dropdown-toggle" data-toggle="dropdown" href="#" role="button">API <span class="caret"></span></a>
<ul class="dropdown-menu" id="package-dropdown-menu">
<li><a class="main-nav-link" href="../api/python/index.html">Python</a></li>
<li><a class="main-nav-link" href="../api/scala/index.html">Scala</a></li>
<li><a class="main-nav-link" href="../api/r/index.html">R</a></li>
<li><a class="main-nav-link" href="../api/julia/index.html">Julia</a></li>
<li><a class="main-nav-link" href="../api/c++/index.html">C++</a></li>
<li><a class="main-nav-link" href="../api/perl/index.html">Perl</a></li>
</ul>
</span>
<a class="main-nav-link" href="../architecture/index.html">Architecture</a>
<!-- <a class="main-nav-link" href="../community/index.html">Community</a> -->
<a class="main-nav-link" href="https://github.com/dmlc/mxnet">Github</a>
<span id="dropdown-menu-position-anchor-version" style="position: relative"><a href="#" class="main-nav-link dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">Versions(v0.10.14)<span class="caret"></span></a><ul id="package-dropdown-menu" class="dropdown-menu"><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/">v0.10.14</a></li><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/versions/0.10/index.html">0.10</a></li><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/versions/master/index.html">master</a></li></ul></span></nav>
<script> function getRootPath(){ return "../" } </script>
<div class="burgerIcon dropdown">
<a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button">☰</a>
<ul class="dropdown-menu dropdown-menu-right" id="burgerMenu">
<li><a href="../get_started/install.html">Install</a></li>
<li><a href="../tutorials/index.html">Tutorials</a></li>
<li><a href="../how_to/index.html">How To</a></li>
<li class="dropdown-submenu">
<a href="#" tabindex="-1">API</a>
<ul class="dropdown-menu">
<li><a href="../api/python/index.html" tabindex="-1">Python</a>
</li>
<li><a href="../api/scala/index.html" tabindex="-1">Scala</a>
</li>
<li><a href="../api/r/index.html" tabindex="-1">R</a>
</li>
<li><a href="../api/julia/index.html" tabindex="-1">Julia</a>
</li>
<li><a href="../api/c++/index.html" tabindex="-1">C++</a>
</li>
<li><a href="../api/perl/index.html" tabindex="-1">Perl</a>
</li>
</ul>
</li>
<li><a href="../architecture/index.html">Architecture</a></li>
<li><a class="main-nav-link" href="https://github.com/dmlc/mxnet">Github</a></li>
<li id="dropdown-menu-position-anchor-version-mobile" class="dropdown-submenu" style="position: relative"><a href="#" tabindex="-1">Versions(v0.10.14)</a><ul class="dropdown-menu"><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/">v0.10.14</a></li><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/versions/0.10/index.html">0.10</a></li><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/versions/master/index.html">master</a></li></ul></li></ul>
</div>
<div class="plusIcon dropdown">
<a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button"><span aria-hidden="true" class="glyphicon glyphicon-plus"></span></a>
<ul class="dropdown-menu dropdown-menu-right" id="plusMenu"></ul>
</div>
<div id="search-input-wrap">
<form action="../search.html" autocomplete="off" class="" method="get" role="search">
<div class="form-group inner-addon left-addon">
<i class="glyphicon glyphicon-search"></i>
<input class="form-control" name="q" placeholder="Search" type="text"/>
</div>
<input name="check_keywords" type="hidden" value="yes">
<input name="area" type="hidden" value="default"/>
</form>
<div id="search-preview"></div>
</div>
<div id="searchIcon">
<span aria-hidden="true" class="glyphicon glyphicon-search"></span>
</div>
<!-- <div id="lang-select-wrap"> -->
<!--   <label id="lang-select-label"> -->
<!--     <\!-- <i class="fa fa-globe"></i> -\-> -->
<!--     <span></span> -->
<!--   </label> -->
<!--   <select id="lang-select"> -->
<!--     <option value="en">Eng</option> -->
<!--     <option value="zh">中文</option> -->
<!--   </select> -->
<!-- </div> -->
<!--     <a id="mobile-nav-toggle">
        <span class="mobile-nav-toggle-bar"></span>
        <span class="mobile-nav-toggle-bar"></span>
        <span class="mobile-nav-toggle-bar"></span>
      </a> -->
</div>
</div>
</div>
<div class="container">
<div class="row">
<div aria-label="main navigation" class="sphinxsidebar leftsidebar" role="navigation">
<div class="sphinxsidebarwrapper">
<ul>
<li class="toctree-l1"><a class="reference internal" href="../api/python/index.html">Python Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../api/r/index.html">R Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../api/julia/index.html">Julia Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../api/c++/index.html">C++ Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../api/scala/index.html">Scala Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../api/perl/index.html">Perl Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="index.html">HowTo Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/index.html">System Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tutorials/index.html">Tutorials</a></li>
</ul>
</div>
</div>
<div class="content">
<div class="section" id="some-tips-for-improving-mxnet-performance">
<span id="some-tips-for-improving-mxnet-performance"></span><h1>Some Tips for Improving MXNet Performance<a class="headerlink" href="#some-tips-for-improving-mxnet-performance" title="Permalink to this headline">¶</a></h1>
<p>Even after fixing the training or deployment environment and parallelization scheme,
a number of configuration settings and data-handling choices can impact the <em>MXNet</em> performance.
In this document, we address some tips for improving <em>MXNet</em> performance.</p>
<p>Performance is mainly affected by the following 4 factors:</p>
<ol class="simple">
<li>Implementation of operators (Convolution, Pooling, ..)<ul>
<li><a class="reference external" href="#intel-cpu">Intel CPU</a></li>
<li><a class="reference external" href="#nvidia-gpu">Nvidia GPU</a></li>
</ul>
</li>
<li>Input data loading and augmentation<ul>
<li><a class="reference external" href="#input-data">Input Data</a></li>
</ul>
</li>
<li>Workloads (computation graph) optimization and scheduling<ul>
<li><a class="reference external" href="#profiler">Profiler</a></li>
</ul>
</li>
<li>Communication for multi-devices training<ul>
<li><a class="reference external" href="#multiple-devices">Multiple Devices</a></li>
</ul>
</li>
</ol>
<div class="section" id="intel-cpu">
<span id="intel-cpu"></span><h2>Intel CPU<a class="headerlink" href="#intel-cpu" title="Permalink to this headline">¶</a></h2>
<p>For using Intel Xeon CPUs for training and inference, we suggest enabling
both <code class="docutils literal"><span class="pre">USE_MKL2017</span> <span class="pre">=</span> <span class="pre">1</span></code> and <code class="docutils literal"><span class="pre">USE_MKL2017_EXPERIMENTAL</span> <span class="pre">=</span> <span class="pre">1</span></code> in
<code class="docutils literal"><span class="pre">config.mk</span></code>. Check
<a class="reference external" href="https://github.com/dmlc/mxnet/blob/master/MKL_README.md">MKL_README.md</a> for
details.</p>
<p>We also find that setting the following two environment variables can help:</p>
<ul class="simple">
<li><code class="docutils literal"><span class="pre">export</span> <span class="pre">KMP_AFFINITY=granularity=fine,compact,1,0</span></code> if there are two physical CPUs</li>
<li><code class="docutils literal"><span class="pre">export</span> <span class="pre">OMP_NUM_THREADS=vCPUs</span> <span class="pre">/</span> <span class="pre">2</span></code> in which <code class="docutils literal"><span class="pre">vCPUs</span></code> is the number of virtual CPUs.
When using Linux, we can access this information by running <code class="docutils literal"><span class="pre">cat</span> <span class="pre">/proc/cpuinfo</span> <span class="pre">|</span> <span class="pre">grep</span> <span class="pre">processor</span> <span class="pre">|</span> <span class="pre">wc</span> <span class="pre">-l</span></code>.</li>
</ul>
<p>Note that <em>MXNet</em> treats all CPUs on a single machine as a single device.
So whether you specify <code class="docutils literal"><span class="pre">cpu(0)</span></code> or <code class="docutils literal"><span class="pre">cpu()</span></code>, <em>MXNet</em> will use all CPU cores on the machine.</p>
<div class="section" id="scoring-results">
<span id="scoring-results"></span><h3>Scoring results<a class="headerlink" href="#scoring-results" title="Permalink to this headline">¶</a></h3>
<p>The following table shows performance,
namely number of images that can be predicted per second.
We used <a class="reference external" href="https://github.com/dmlc/mxnet/blob/master/example/image-classification/benchmark_score.py">example/image-classification/benchmark_score.py</a>
to measure the performance on different AWS EC2 machines.</p>
<p>AWS EC2 C4.8xlarge:</p>
<table border="1" class="docutils">
<colgroup>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Batch</th>
<th class="head">Alexnet</th>
<th class="head">VGG</th>
<th class="head">Inception-BN</th>
<th class="head">Inception-v3</th>
<th class="head">Resnet 50</th>
<th class="head">Resnet 152</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>1</td>
<td>119.57</td>
<td>34.23</td>
<td>111.36</td>
<td>54.42</td>
<td>42.83</td>
<td>19.51</td>
</tr>
<tr class="row-odd"><td>2</td>
<td>210.58</td>
<td>51.63</td>
<td>137.10</td>
<td>67.30</td>
<td>57.54</td>
<td>23.56</td>
</tr>
<tr class="row-even"><td>4</td>
<td>318.54</td>
<td>70.00</td>
<td>187.21</td>
<td>76.53</td>
<td>63.64</td>
<td>25.80</td>
</tr>
<tr class="row-odd"><td>8</td>
<td>389.34</td>
<td>77.39</td>
<td>211.90</td>
<td>84.26</td>
<td>63.89</td>
<td>28.11</td>
</tr>
<tr class="row-even"><td>16</td>
<td>489.12</td>
<td>85.26</td>
<td>220.52</td>
<td>82.00</td>
<td>63.93</td>
<td>27.08</td>
</tr>
<tr class="row-odd"><td>32</td>
<td>564.04</td>
<td>87.15</td>
<td>208.21</td>
<td>83.05</td>
<td>62.19</td>
<td>25.76</td>
</tr>
</tbody>
</table>
<p>AWS EC2 C4.4xlarge:</p>
<table border="1" class="docutils">
<colgroup>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Batch</th>
<th class="head">Alexnet</th>
<th class="head">VGG</th>
<th class="head">Inception-BN</th>
<th class="head">Inception-v3</th>
<th class="head">Resnet 50</th>
<th class="head">Resnet 152</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>1</td>
<td>109.96</td>
<td>23.00</td>
<td>71.82</td>
<td>28.10</td>
<td>30.66</td>
<td>11.81</td>
</tr>
<tr class="row-odd"><td>2</td>
<td>124.56</td>
<td>24.86</td>
<td>81.61</td>
<td>31.32</td>
<td>32.73</td>
<td>12.82</td>
</tr>
<tr class="row-even"><td>4</td>
<td>157.01</td>
<td>26.60</td>
<td>86.77</td>
<td>32.94</td>
<td>33.32</td>
<td>13.16</td>
</tr>
<tr class="row-odd"><td>8</td>
<td>178.40</td>
<td>30.67</td>
<td>88.58</td>
<td>33.52</td>
<td>33.32</td>
<td>13.32</td>
</tr>
<tr class="row-even"><td>16</td>
<td>189.52</td>
<td>35.61</td>
<td>90.36</td>
<td>33.63</td>
<td>32.94</td>
<td>13.18</td>
</tr>
<tr class="row-odd"><td>32</td>
<td>196.61</td>
<td>38.98</td>
<td>105.27</td>
<td>33.77</td>
<td>32.65</td>
<td>13.00</td>
</tr>
</tbody>
</table>
<p>AWS EC2 C4.2xlarge:</p>
<table border="1" class="docutils">
<colgroup>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Batch</th>
<th class="head">Alexnet</th>
<th class="head">VGG</th>
<th class="head">Inception-BN</th>
<th class="head">Inception-v3</th>
<th class="head">Resnet 50</th>
<th class="head">Resnet 152</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>1</td>
<td>70.75</td>
<td>12.87</td>
<td>42.86</td>
<td>16.53</td>
<td>18.14</td>
<td>7.01</td>
</tr>
<tr class="row-odd"><td>2</td>
<td>71.53</td>
<td>13.08</td>
<td>45.66</td>
<td>17.38</td>
<td>18.53</td>
<td>7.18</td>
</tr>
<tr class="row-even"><td>4</td>
<td>84.72</td>
<td>15.38</td>
<td>47.50</td>
<td>17.80</td>
<td>18.96</td>
<td>7.35</td>
</tr>
<tr class="row-odd"><td>8</td>
<td>93.44</td>
<td>18.33</td>
<td>48.08</td>
<td>17.93</td>
<td>18.99</td>
<td>7.40</td>
</tr>
<tr class="row-even"><td>16</td>
<td>97.03</td>
<td>20.12</td>
<td>55.73</td>
<td>18.00</td>
<td>18.91</td>
<td>7.36</td>
</tr>
<tr class="row-odd"><td>32</td>
<td>113.90</td>
<td>21.10</td>
<td>62.54</td>
<td>17.98</td>
<td>18.80</td>
<td>7.33</td>
</tr>
</tbody>
</table>
<p>AWS EC2 C4.xlarge:</p>
<table border="1" class="docutils">
<colgroup>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Batch</th>
<th class="head">Alexnet</th>
<th class="head">VGG</th>
<th class="head">Inception-BN</th>
<th class="head">Inception-v3</th>
<th class="head">Resnet 50</th>
<th class="head">Resnet 152</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>1</td>
<td>37.92</td>
<td>6.57</td>
<td>23.09</td>
<td>8.79</td>
<td>9.65</td>
<td>3.73</td>
</tr>
<tr class="row-odd"><td>2</td>
<td>36.77</td>
<td>7.31</td>
<td>24.00</td>
<td>9.00</td>
<td>9.84</td>
<td>3.78</td>
</tr>
<tr class="row-even"><td>4</td>
<td>43.18</td>
<td>8.94</td>
<td>24.42</td>
<td>9.12</td>
<td>9.91</td>
<td>3.83</td>
</tr>
<tr class="row-odd"><td>8</td>
<td>47.05</td>
<td>10.01</td>
<td>28.32</td>
<td>9.13</td>
<td>9.88</td>
<td>3.83</td>
</tr>
<tr class="row-even"><td>16</td>
<td>55.74</td>
<td>10.61</td>
<td>31.96</td>
<td>9.14</td>
<td>9.86</td>
<td>3.80</td>
</tr>
<tr class="row-odd"><td>32</td>
<td>65.05</td>
<td>10.91</td>
<td>33.86</td>
<td>9.34</td>
<td>10.31</td>
<td>3.86</td>
</tr>
</tbody>
</table>
<p>AWS EC2 C4.large:</p>
<table border="1" class="docutils">
<colgroup>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Batch</th>
<th class="head">Alexnet</th>
<th class="head">VGG</th>
<th class="head">Inception-BN</th>
<th class="head">Inception-v3</th>
<th class="head">Resnet 50</th>
<th class="head">Resnet 152</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>1</td>
<td>19.86</td>
<td>3.67</td>
<td>12.20</td>
<td>4.59</td>
<td>5.11</td>
<td>1.97</td>
</tr>
<tr class="row-odd"><td>2</td>
<td>19.37</td>
<td>4.24</td>
<td>12.41</td>
<td>4.64</td>
<td>5.15</td>
<td>1.98</td>
</tr>
<tr class="row-even"><td>4</td>
<td>22.64</td>
<td>4.89</td>
<td>14.34</td>
<td>4.66</td>
<td>5.16</td>
<td>2.00</td>
</tr>
<tr class="row-odd"><td>8</td>
<td>27.19</td>
<td>5.25</td>
<td>16.17</td>
<td>4.66</td>
<td>5.16</td>
<td>1.99</td>
</tr>
<tr class="row-even"><td>16</td>
<td>31.82</td>
<td>5.46</td>
<td>17.24</td>
<td>4.76</td>
<td>5.35</td>
<td>OOM</td>
</tr>
<tr class="row-odd"><td>32</td>
<td>34.67</td>
<td>5.55</td>
<td>17.64</td>
<td>4.88</td>
<td>OOM</td>
<td>OOM</td>
</tr>
</tbody>
</table>
</div>
</div>
<div class="section" id="other-cpu">
<span id="other-cpu"></span><h2>Other CPU<a class="headerlink" href="#other-cpu" title="Permalink to this headline">¶</a></h2>
<p>On CPUs (not only Intel CPUs, but also ARM), NNPACK can improve running performance by 2x to 7x. Please check <a class="reference internal" href="nnpack.html"><em>nnpack.md</em></a> for details.</p>
</div>
<div class="section" id="nvidia-gpu">
<span id="nvidia-gpu"></span><h2>Nvidia GPU<a class="headerlink" href="#nvidia-gpu" title="Permalink to this headline">¶</a></h2>
<p><code class="docutils literal"><span class="pre">cuDNN</span></code> typically accelerates <em>MXNet</em> performance on NVIDIA GPUs significantly,
especially for convolution layers.
We suggest always checking to make sure that a recent cuDNN version is used.</p>
<p>Setting the environment <code class="docutils literal"><span class="pre">export</span> <span class="pre">MXNET_CUDNN_AUTOTUNE_DEFAULT=1</span></code> sometimes also helps.</p>
<p>We show results when using various GPUs including K80 (EC2 p2.2xlarge), M40,
and P100 (DGX-1).</p>
<div class="section" id="gpu-scoring-results">
<span id="id1"></span><h3>Scoring results<a class="headerlink" href="#gpu-scoring-results" title="Permalink to this headline">¶</a></h3>
<p>Based on
<a class="reference external" href="https://github.com/dmlc/mxnet/blob/master/example/image-classification/benchmark_score.py">example/image-classification/benchmark_score.py</a>
and MXNet commit <code class="docutils literal"><span class="pre">0a03417</span></code>, with cuDNN 5.1</p>
<ul class="simple">
<li>K80 (single GPU)</li>
</ul>
<table border="1" class="docutils">
<colgroup>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Batch</th>
<th class="head">Alexnet</th>
<th class="head">VGG</th>
<th class="head">Inception-BN</th>
<th class="head">Inception-v3</th>
<th class="head">Resnet 50</th>
<th class="head">Resnet 152</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>1</td>
<td>202.66</td>
<td>70.76</td>
<td>74.91</td>
<td>42.61</td>
<td>70.94</td>
<td>24.87</td>
</tr>
<tr class="row-odd"><td>2</td>
<td>233.76</td>
<td>63.53</td>
<td>119.60</td>
<td>60.09</td>
<td>92.28</td>
<td>34.23</td>
</tr>
<tr class="row-even"><td>4</td>
<td>367.91</td>
<td>78.16</td>
<td>164.41</td>
<td>72.30</td>
<td>116.68</td>
<td>44.76</td>
</tr>
<tr class="row-odd"><td>8</td>
<td>624.14</td>
<td>119.06</td>
<td>195.24</td>
<td>79.62</td>
<td>129.37</td>
<td>50.96</td>
</tr>
<tr class="row-even"><td>16</td>
<td>1071.19</td>
<td>195.83</td>
<td>256.06</td>
<td>99.38</td>
<td>160.40</td>
<td>66.51</td>
</tr>
<tr class="row-odd"><td>32</td>
<td>1443.90</td>
<td>228.96</td>
<td>287.93</td>
<td>106.43</td>
<td>167.12</td>
<td>69.73</td>
</tr>
</tbody>
</table>
<ul class="simple">
<li>M40</li>
</ul>
<table border="1" class="docutils">
<colgroup>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Batch</th>
<th class="head">Alexnet</th>
<th class="head">VGG</th>
<th class="head">Inception-BN</th>
<th class="head">Inception-v3</th>
<th class="head">Resnet 50</th>
<th class="head">Resnet 152</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>1</td>
<td>412.09</td>
<td>142.10</td>
<td>115.89</td>
<td>64.40</td>
<td>126.90</td>
<td>46.15</td>
</tr>
<tr class="row-odd"><td>2</td>
<td>743.49</td>
<td>212.21</td>
<td>205.31</td>
<td>108.06</td>
<td>202.17</td>
<td>75.05</td>
</tr>
<tr class="row-even"><td>4</td>
<td>1155.43</td>
<td>280.92</td>
<td>335.69</td>
<td>161.59</td>
<td>266.53</td>
<td>106.83</td>
</tr>
<tr class="row-odd"><td>8</td>
<td>1606.87</td>
<td>332.76</td>
<td>491.12</td>
<td>224.22</td>
<td>317.20</td>
<td>128.67</td>
</tr>
<tr class="row-even"><td>16</td>
<td>2070.97</td>
<td>400.10</td>
<td>618.25</td>
<td>251.87</td>
<td>335.62</td>
<td>134.60</td>
</tr>
<tr class="row-odd"><td>32</td>
<td>2694.91</td>
<td>466.95</td>
<td>624.27</td>
<td>258.59</td>
<td>373.35</td>
<td>152.71</td>
</tr>
</tbody>
</table>
<ul class="simple">
<li>P100</li>
</ul>
<table border="1" class="docutils">
<colgroup>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
<col width="14%"/>
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Batch</th>
<th class="head">Alexnet</th>
<th class="head">VGG</th>
<th class="head">Inception-BN</th>
<th class="head">Inception-v3</th>
<th class="head">Resnet 50</th>
<th class="head">Resnet 152</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>1</td>
<td>624.84</td>
<td>294.6</td>
<td>139.82</td>
<td>80.17</td>
<td>162.27</td>
<td>58.99</td>
</tr>
<tr class="row-odd"><td>2</td>
<td>1226.85</td>
<td>282.3</td>
<td>267.41</td>
<td>142.63</td>
<td>278.02</td>
<td>102.95</td>
</tr>
<tr class="row-even"><td>4</td>
<td>1934.97</td>
<td>399.3</td>
<td>463.38</td>
<td>225.56</td>
<td>423.63</td>
<td>168.91</td>
</tr>
<tr class="row-odd"><td>8</td>
<td>2900.54</td>
<td>522.9</td>
<td>709.30</td>
<td>319.52</td>
<td>529.34</td>
<td>210.10</td>
</tr>
<tr class="row-even"><td>16</td>
<td>4063.70</td>
<td>755.3</td>
<td>949.22</td>
<td>444.65</td>
<td>647.43</td>
<td>270.07</td>
</tr>
<tr class="row-odd"><td>32</td>
<td>4883.77</td>
<td>854.4</td>
<td>1197.74</td>
<td>493.72</td>
<td>713.17</td>
<td>294.17</td>
</tr>
</tbody>
</table>
</div>
<div class="section" id="training-results">
<span id="training-results"></span><h3>Training results<a class="headerlink" href="#training-results" title="Permalink to this headline">¶</a></h3>
<p>Based on
<a class="reference external" href="https://github.com/dmlc/mxnet/blob/master/example/image-classification/train_imagenet.py">example/image-classification/train_imagenet.py</a>
and MXNet commit <code class="docutils literal"><span class="pre">0a03417</span></code>, with cuDNN 5.1. The benchmark script is available
<a class="reference external" href="https://github.com/mli/mxnet-benchmark/blob/master/run_vary_batch.sh">here</a>,
where the batch size for Alexnet is increased by 8x.</p>
<ul class="simple">
<li>K80 (single GPU)</li>
</ul>
<table border="1" class="docutils">
<colgroup>
<col width="25%"/>
<col width="25%"/>
<col width="25%"/>
<col width="25%"/>
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Batch</th>
<th class="head">Alexnet(*8)</th>
<th class="head">Inception-v3</th>
<th class="head">Resnet 50</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>1</td>
<td>230.69</td>
<td>9.81</td>
<td>13.83</td>
</tr>
<tr class="row-odd"><td>2</td>
<td>348.10</td>
<td>15.31</td>
<td>21.85</td>
</tr>
<tr class="row-even"><td>4</td>
<td>457.28</td>
<td>20.48</td>
<td>29.58</td>
</tr>
<tr class="row-odd"><td>8</td>
<td>533.51</td>
<td>24.47</td>
<td>36.83</td>
</tr>
<tr class="row-even"><td>16</td>
<td>582.36</td>
<td>28.46</td>
<td>43.60</td>
</tr>
<tr class="row-odd"><td>32</td>
<td>483.37</td>
<td>29.62</td>
<td>45.52</td>
</tr>
</tbody>
</table>
<ul class="simple">
<li>M40</li>
</ul>
<table border="1" class="docutils">
<colgroup>
<col width="25%"/>
<col width="25%"/>
<col width="25%"/>
<col width="25%"/>
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Batch</th>
<th class="head">Alexnet(*8)</th>
<th class="head">Inception-v3</th>
<th class="head">Resnet 50</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>1</td>
<td>405.17</td>
<td>14.35</td>
<td>21.56</td>
</tr>
<tr class="row-odd"><td>2</td>
<td>606.32</td>
<td>23.96</td>
<td>36.48</td>
</tr>
<tr class="row-even"><td>4</td>
<td>792.66</td>
<td>37.38</td>
<td>52.96</td>
</tr>
<tr class="row-odd"><td>8</td>
<td>1016.51</td>
<td>52.69</td>
<td>70.21</td>
</tr>
<tr class="row-even"><td>16</td>
<td>1105.18</td>
<td>62.35</td>
<td>83.13</td>
</tr>
<tr class="row-odd"><td>32</td>
<td>1046.23</td>
<td>68.87</td>
<td>90.74</td>
</tr>
</tbody>
</table>
<ul class="simple">
<li>P100</li>
</ul>
<table border="1" class="docutils">
<colgroup>
<col width="25%"/>
<col width="25%"/>
<col width="25%"/>
<col width="25%"/>
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Batch</th>
<th class="head">Alexnet(*8)</th>
<th class="head">Inception-v3</th>
<th class="head">Resnet 50</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>1</td>
<td>809.94</td>
<td>15.14</td>
<td>27.20</td>
</tr>
<tr class="row-odd"><td>2</td>
<td>1202.93</td>
<td>30.34</td>
<td>49.55</td>
</tr>
<tr class="row-even"><td>4</td>
<td>1631.37</td>
<td>50.59</td>
<td>78.31</td>
</tr>
<tr class="row-odd"><td>8</td>
<td>1882.74</td>
<td>77.75</td>
<td>122.45</td>
</tr>
<tr class="row-even"><td>16</td>
<td>2012.04</td>
<td>111.11</td>
<td>156.79</td>
</tr>
<tr class="row-odd"><td>32</td>
<td>1869.69</td>
<td>129.98</td>
<td>181.53</td>
</tr>
</tbody>
</table>
</div>
</div>
<div class="section" id="multiple-devices">
<span id="multiple-devices"></span><h2>Multiple Devices<a class="headerlink" href="#multiple-devices" title="Permalink to this headline">¶</a></h2>
<p>If more than one GPU or machine is used, MXNet uses <code class="docutils literal"><span class="pre">kvstore</span></code> to communicate data.
It’s critical to use the proper type of <code class="docutils literal"><span class="pre">kvstore</span></code> to get the best performance.
Refer to <a class="reference external" href="http://mxnet.io/how_to/multi_devices.html">multi_device.md</a> for more
details.</p>
<p>Besides, we can use <a class="reference external" href="https://github.com/dmlc/mxnet/tree/master/tools/bandwidth">tools/bandwidth</a>
to find the communication cost per batch.
Ideally, the communication cost should be less than the time to compute a batch.
To reduce the communication cost, we can consider:</p>
<ul class="simple">
<li>Exploring different <code class="docutils literal"><span class="pre">--kv-store</span></code> options.</li>
<li>Increasing the batch size to improve the computation to communication ratio.</li>
</ul>
</div>
<div class="section" id="input-data">
<h2>Input Data<a class="headerlink" href="#input-data" title="Permalink to this headline">¶</a></h2>
<p>To make sure you’re handling input data in a reasonable way, consider the following:</p>
<ul class="simple">
<li>Data format: If you are using the <code class="docutils literal"><span class="pre">rec</span></code> format, then everything should be fine.</li>
<li>Decoding: By default, <em>MXNet</em> uses 4 CPU threads for decoding images.
This is often sufficient to decode more than 1K images per second.
If you are using a low-end CPU or your GPUs are very powerful, you can increase the number of threads.</li>
<li>Storage location: Any local or distributed file system (HDFS, Amazon S3) should be fine.
If multiple devices read the data from a shared network file system (NFS) at the same time, problems might occur.</li>
<li>Use a large batch size. We often choose the largest one that fits into GPU memory.
However, a value that’s too large can slow down convergence.
For example, the safe batch size for CIFAR-10 is approximately 200, while for ImageNet 1K, the batch size can exceed 1K.</li>
</ul>
</div>
<div class="section" id="profiler">
<h2>Profiler<a class="headerlink" href="#profiler" title="Permalink to this headline">¶</a></h2>
<p>As of v0.9.1 (with the NNVM merge), <em>MXNet</em> has a built-in profiler
that gives detailed information about execution time at the symbol level.
This feature complements general profiling tools like <em>nvprof</em> and <em>gprof</em>
by summarizing at the operator level, instead of a function, kernel, or instruction level.</p>
<p>In order to be able to use the profiler, you must compile <em>MXNet</em> with the <code class="docutils literal"><span class="pre">USE_PROFILER=1</span></code> flag in <code class="docutils literal"><span class="pre">config.mk</span></code>.</p>
<p>The profiler can then be turned on with an <a class="reference external" href="http://mxnet.io/how_to/env_var.html#control-the-profiler">environment variable</a>
for an entire program run, or programmatically for just part of a run.
See <a class="reference external" href="https://github.com/dmlc/mxnet/tree/master/example/profiler">example/profiler</a>
for complete examples of how to use the profiler in code, but briefly, the Python code looks like:</p>
<div class="highlight-python"><div class="highlight"><pre><span></span>    <span class="n">mx</span><span class="o">.</span><span class="n">profiler</span><span class="o">.</span><span class="n">profiler_set_config</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s1">'all'</span><span class="p">,</span> <span class="n">filename</span><span class="o">=</span><span class="s1">'profile_output.json'</span><span class="p">)</span>
    <span class="n">mx</span><span class="o">.</span><span class="n">profiler</span><span class="o">.</span><span class="n">profiler_set_state</span><span class="p">(</span><span class="s1">'run'</span><span class="p">)</span>

    <span class="c1"># Code to be profiled goes here...</span>

    <span class="n">mx</span><span class="o">.</span><span class="n">profiler</span><span class="o">.</span><span class="n">profiler_set_state</span><span class="p">(</span><span class="s1">'stop'</span><span class="p">)</span>
</pre></div>
</div>
<p>The <code class="docutils literal"><span class="pre">mode</span></code> parameter can be set to</p>
<ul class="simple">
<li><code class="docutils literal"><span class="pre">symbolic</span></code> to only include symbolic operations</li>
<li><code class="docutils literal"><span class="pre">all</span></code> to include all operations</li>
</ul>
<p>After the program finishes, open your browser’s tracing tool (for example, chrome://tracing in Chrome) and load the <code class="docutils literal"><span class="pre">profile_output.json</span></code> file written by the profiler to inspect the results.</p>
<p><img alt="MLP Profile" src="https://cloud.githubusercontent.com/assets/17693755/18035938/0a43484a-6d93-11e6-80d4-241c6ca552ea.png"/></p>
<p>Note that the output file can grow extremely large, so this approach is not recommended for general use.</p>
</div>
</div>
<div class="container">
<div class="footer">
<p> © 2015-2017 DMLC. All rights reserved. </p>
</div>
</div>
</div>
<div aria-label="main navigation" class="sphinxsidebar rightsidebar" role="navigation">
<div class="sphinxsidebarwrapper">
<h3><a href="../index.html">Table Of Contents</a></h3>
<ul>
<li><a class="reference internal" href="#">Some Tips for Improving MXNet Performance</a><ul>
<li><a class="reference internal" href="#intel-cpu">Intel CPU</a><ul>
<li><a class="reference internal" href="#scoring-results">Scoring results</a></li>
</ul>
</li>
<li><a class="reference internal" href="#other-cpu">Other CPU</a></li>
<li><a class="reference internal" href="#nvidia-gpu">Nvidia GPU</a><ul>
<li><a class="reference internal" href="#scoring-results">Scoring results</a></li>
<li><a class="reference internal" href="#training-results">Training results</a></li>
</ul>
</li>
<li><a class="reference internal" href="#multiple-devices">Multiple Devices</a></li>
<li><a class="reference internal" href="#input-data">Input Data</a></li>
<li><a class="reference internal" href="#profiler">Profiler</a></li>
</ul>
</li>
</ul>
</div>
</div>
</div> <!-- pagename != index -->
<script crossorigin="anonymous" integrity="sha384-0mSbJDEHialfmuBBQP6A4Qrprq5OVfW37PRR3j5ELqxss1yVqOtnepnHVP9aJ7xS" src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js"></script>
<script src="../_static/js/sidebar.js" type="text/javascript"></script>
<script src="../_static/js/search.js" type="text/javascript"></script>
<script src="../_static/js/navbar.js" type="text/javascript"></script>
<script src="../_static/js/clipboard.min.js" type="text/javascript"></script>
<script src="../_static/js/copycode.js" type="text/javascript"></script>
<script type="text/javascript">
        // Reveal the page once the DOM has finished loading.
        // jQuery's ready handler always fires for the document, whatever
        // element is selected, and calling .ready() on a non-document
        // selection is deprecated as of jQuery 3.0, so bind to the document.
        // NOTE(review): presumably a stylesheet hides <body> until this
        // runs; confirm in mxnet.css.
        $(document).ready(function () {
            $('body').css('visibility', 'visible');
        });
    </script>
</div></body>
</html>