blob: 5cb4154d1512069ea82252b04c201fe80f48ccaa [file] [log] [blame]
<!DOCTYPE html>
<!---
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<html lang="en"><head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link href="/versions/master/assets/img/mxnet-icon.png" rel="icon" type="image/png"><!-- Begin Jekyll SEO tag v2.6.1 -->
<title>Some Tips for Improving MXNet Performance | Apache MXNet</title>
<meta name="generator" content="Jekyll v4.0.0" />
<meta property="og:title" content="Some Tips for Improving MXNet Performance" />
<meta property="og:locale" content="en_US" />
<meta name="description" content="A flexible and efficient library for deep learning." />
<meta property="og:description" content="A flexible and efficient library for deep learning." />
<link rel="canonical" href="https://mxnet.apache.org/versions/master/api/faq/perf" />
<meta property="og:url" content="https://mxnet.apache.org/versions/master/api/faq/perf" />
<meta property="og:site_name" content="Apache MXNet" />
<script type="application/ld+json">
{"url":"https://mxnet.apache.org/versions/master/api/faq/perf","headline":"Some Tips for Improving MXNet Performance","description":"A flexible and efficient library for deep learning.","@type":"WebPage","@context":"https://schema.org"}</script>
<!-- End Jekyll SEO tag -->
<link rel="stylesheet" href="/versions/master/assets/docsearch.min.css" /><link rel="stylesheet" href="/versions/master/assets/main.css"><link type="application/atom+xml" rel="alternate" href="https://mxnet.apache.org/versions/master/feed.xml" title="Apache MXNet" /><!-- Matomo -->
<script>
// Matomo command queue: reuse the one already on window, or start a new array.
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
/* We explicitly disable cookie tracking to avoid privacy issues */
_paq.push(['disableCookies']);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function () {
  var baseUrl = "https://analytics.apache.org/";
  _paq.push(['setTrackerUrl', baseUrl + 'matomo.php']);
  _paq.push(['setSiteId', '23']);

  // Insert an async <script> for matomo.js right before the first
  // <script> element found in the document.
  var firstScript = document.getElementsByTagName('script')[0];
  var loader = document.createElement('script');
  loader.async = true;
  loader.src = baseUrl + 'matomo.js';
  firstScript.parentNode.insertBefore(loader, firstScript);
})();
</script>
<!-- End Matomo Code -->
<script src="/versions/master/assets/js/jquery-3.3.1.min.js"></script>
<script src="/versions/master/assets/js/docsearch.min.js"></script><script src="/versions/master/assets/js/globalSearch.js" defer></script>
<script src="/versions/master/assets/js/clipboard.js" defer></script>
<script src="/versions/master/assets/js/copycode.js" defer></script></head>
<body><header class="site-header" role="banner">
<script>
// Header behaviour, wired up once the DOM is ready.
$(function () {
  // Header tint: the background alpha starts at 0.4 and increases by
  // 1/300 for every unit of vertical scroll offset.
  var refreshHeaderTint = function () {
    var alpha = $(window).scrollTop() / 300 + 0.4;
    $('.site-header').css("background-color", "rgba(4,140,204," + alpha + ")");
  };
  $(window).on("scroll", refreshHeaderTint);
  // Apply once immediately so the header is tinted before any scroll event.
  refreshHeaderTint();

  // Menu highlighting: flag every nav link whose resolved href occurs
  // inside the current page address.
  $('.page-link')
    .filter(function () {
      return window.location.href.includes(this.href);
    })
    .addClass("page-current");
});
</script>
<div class="wrapper">
<a class="site-title" rel="author" href="/versions/master/"><img
src="/versions/master/assets/img/mxnet_logo.png" class="site-header-logo" alt="Apache MXNet"></a>
<nav class="site-nav">
<input type="checkbox" id="nav-trigger" class="nav-trigger"/>
<label for="nav-trigger">
<span class="menu-icon">
<svg viewBox="0 0 18 15" width="18px" height="15px">
<path d="M18,1.484c0,0.82-0.665,1.484-1.484,1.484H1.484C0.665,2.969,0,2.304,0,1.484l0,0C0,0.665,0.665,0,1.484,0 h15.032C17.335,0,18,0.665,18,1.484L18,1.484z M18,7.516C18,8.335,17.335,9,16.516,9H1.484C0.665,9,0,8.335,0,7.516l0,0 c0-0.82,0.665-1.484,1.484-1.484h15.032C17.335,6.031,18,6.696,18,7.516L18,7.516z M18,13.516C18,14.335,17.335,15,16.516,15H1.484 C0.665,15,0,14.335,0,13.516l0,0c0-0.82,0.665-1.483,1.484-1.483h15.032C17.335,12.031,18,12.695,18,13.516L18,13.516z"/>
</svg>
</span>
</label>
<div class="gs-search-border">
<div id="gs-search-icon"></div>
<form id="global-search-form">
<input id="global-search" type="text" title="Search" placeholder="Search" />
<div id="global-search-dropdown-container">
<button class="gs-current-version btn" type="button" data-toggle="dropdown">
<span id="gs-current-version-label">master</span>
<svg class="gs-dropdown-caret" viewBox="0 0 32 32" aria-hidden="true">
<path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path>
</svg>
</button>
<ul class="gs-opt-group gs-version-dropdown">
<li class="gs-opt gs-versions active">master</li>
<li class="gs-opt gs-versions">1.9.1</li>
<li class="gs-opt gs-versions">1.8.0</li>
<li class="gs-opt gs-versions">1.7.0</li>
<li class="gs-opt gs-versions">1.6.0</li>
<li class="gs-opt gs-versions">1.5.0</li>
<li class="gs-opt gs-versions">1.4.1</li>
<li class="gs-opt gs-versions">1.3.1</li>
<li class="gs-opt gs-versions">1.2.1</li>
<li class="gs-opt gs-versions">1.1.0</li>
<li class="gs-opt gs-versions">1.0.0</li>
<li class="gs-opt gs-versions">0.12.1</li>
<li class="gs-opt gs-versions">0.11.0</li>
</ul>
</div>
<span id="global-search-close">x</span>
</form>
</div>
<div class="trigger">
<div id="global-search-mobile-border">
<div id="gs-search-icon-mobile"></div>
<input id="global-search-mobile" placeholder="Search..." type="text"/>
<div id="global-search-dropdown-container-mobile">
<button class="gs-current-version-mobile btn" type="button" data-toggle="dropdown">
<svg class="gs-dropdown-caret" viewBox="0 0 32 32" aria-hidden="true">
<path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path>
</svg>
</button>
<ul class="gs-opt-group gs-version-dropdown-mobile">
<li class="gs-opt gs-versions active">master</li>
<li class="gs-opt gs-versions">1.9.1</li>
<li class="gs-opt gs-versions">1.8.0</li>
<li class="gs-opt gs-versions">1.7.0</li>
<li class="gs-opt gs-versions">1.6.0</li>
<li class="gs-opt gs-versions">1.5.0</li>
<li class="gs-opt gs-versions">1.4.1</li>
<li class="gs-opt gs-versions">1.3.1</li>
<li class="gs-opt gs-versions">1.2.1</li>
<li class="gs-opt gs-versions">1.1.0</li>
<li class="gs-opt gs-versions">1.0.0</li>
<li class="gs-opt gs-versions">0.12.1</li>
<li class="gs-opt gs-versions">0.11.0</li>
</ul>
</div>
</div>
<a class="page-link" href="/versions/master/get_started">Get Started</a>
<a class="page-link" href="/versions/master/features">Features</a>
<a class="page-link" href="/versions/master/ecosystem">Ecosystem</a>
<a class="page-link" href="/versions/master/api">Docs &amp; Tutorials</a>
<a class="page-link" href="/versions/master/trusted_by">Trusted By</a>
<a class="page-link" href="https://github.com/apache/mxnet">GitHub</a>
<div class="dropdown" style="min-width:100px">
<span class="dropdown-header">Apache
<svg class="dropdown-caret" viewBox="0 0 32 32" aria-hidden="true"><path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path></svg>
</span>
<div class="dropdown-content" style="min-width:250px">
<a href="https://www.apache.org/foundation/">Apache Software Foundation</a>
<a href="https://www.apache.org/licenses/">License</a>
<a href="/versions/master/api/faq/security.html">Security</a>
<a href="https://privacy.apache.org/policies/privacy-policy-public.html">Privacy</a>
<a href="https://www.apache.org/events/current-event">Events</a>
<a href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a>
<a href="https://www.apache.org/foundation/thanks.html">Thanks</a>
</div>
</div>
<div class="dropdown">
<span class="dropdown-header">master
<svg class="dropdown-caret" viewBox="0 0 32 32" aria-hidden="true"><path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path></svg>
</span>
<div class="dropdown-content">
<a class="dropdown-option-active" href="/">master</a>
<a href="/versions/1.9.1/">1.9.1</a>
<a href="/versions/1.8.0/">1.8.0</a>
<a href="/versions/1.7.0/">1.7.0</a>
<a href="/versions/1.6.0/">1.6.0</a>
<a href="/versions/1.5.0/">1.5.0</a>
<a href="/versions/1.4.1/">1.4.1</a>
<a href="/versions/1.3.1/">1.3.1</a>
<a href="/versions/1.2.1/">1.2.1</a>
<a href="/versions/1.1.0/">1.1.0</a>
<a href="/versions/1.0.0/">1.0.0</a>
<a href="/versions/0.12.1/">0.12.1</a>
<a href="/versions/0.11.0/">0.11.0</a>
</div>
</div>
</div>
</nav>
</div>
</header>
<main class="page-content" aria-label="Content">
<script>
</script>
<article class="post">
<header class="post-header wrapper">
<h1 class="post-title">Some Tips for Improving MXNet Performance</h1>
<h3></h3></header>
<div class="post-content">
<div class="wrapper">
<div class="row">
<div class="col-3 docs-side-bar">
<h3 style="text-transform: capitalize; padding-left:10px">faq</h3>
<ul>
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/master/api/faq/add_op_in_backend">A Beginner's Guide to Implementing Operators in MXNet Backend</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/master/api/faq/cloud">MXNet on the Cloud</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/master/api/faq/distributed_training">Distributed Training in MXNet</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/master/api/faq/env_var">Environment Variables</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/master/api/faq/float16">Float16</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/master/api/faq/large_tensor_support">Using MXNet with Large Tensor Support</a></li>
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/master/api/faq/model_parallel_lstm">Model Parallel</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/master/api/faq/new_op">Create New Operators</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/master/api/faq/perf">Some Tips for Improving MXNet Performance</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/master/api/faq/recordio">Create a Dataset Using RecordIO</a></li>
<!-- page-category -->
<li><a href="/versions/master/api/faq/s3_integration">Use data from S3 for training</a></li>
<!-- page-category -->
<li><a href="/versions/master/api/faq/security">MXNet Security Best Practices</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/master/api/faq/tensor_inspector_tutorial">Use TensorInspector to Help Debug Operators</a></li>
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/master/api/faq/using_rtc">Using runtime compilation (RTC) to write CUDA kernels in MXNet</a></li>
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/master/api/faq/why_mxnet">Why MXNet came to be?</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- resource-p -->
</ul>
</div>
<div class="col-9">
<!--- Licensed to the Apache Software Foundation (ASF) under one -->
<!--- or more contributor license agreements. See the NOTICE file -->
<!--- distributed with this work for additional information -->
<!--- regarding copyright ownership. The ASF licenses this file -->
<!--- to you under the Apache License, Version 2.0 (the -->
<!--- "License"); you may not use this file except in compliance -->
<!--- with the License. You may obtain a copy of the License at -->
<!--- http://www.apache.org/licenses/LICENSE-2.0 -->
<!--- Unless required by applicable law or agreed to in writing, -->
<!--- software distributed under the License is distributed on an -->
<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
<!--- KIND, either express or implied. See the License for the -->
<!--- specific language governing permissions and limitations -->
<!--- under the License. -->
<h1 id="some-tips-for-improving-mxnet-performance">Some Tips for Improving MXNet Performance</h1>
<p>Even after fixing the training or deployment environment and parallelization scheme,
a number of configuration settings and data-handling choices can impact the <em>MXNet</em> performance.
In this document, we address some tips for improving <em>MXNet</em> performance.</p>
<p>Performance is mainly affected by the following 4 factors:</p>
<ol>
<li>Implementation of operators (Convolution, Pooling, ..)
<ul>
<li><a href="#intel-cpu">Intel CPU</a></li>
<li><a href="#nvidia-gpu">Nvidia GPU</a></li>
</ul>
</li>
<li>Input data loading and augmentation
<ul>
<li><a href="#input-data">Input Data</a></li>
</ul>
</li>
<li>Workloads (computation graph) optimization and scheduling
<ul>
<li><a href="#profiler">Profiler</a></li>
</ul>
</li>
<li>Communication for multi-device training
<ul>
<li><a href="#multiple-devices">Multiple Devices</a></li>
</ul>
</li>
</ol>
<h2 id="intel-cpu">Intel CPU</h2>
<p>When using Intel Xeon CPUs for training and inference, the <code class="highlighter-rouge">mxnet-mkl</code> package is recommended. Adding <code class="highlighter-rouge">--pre</code> installs a nightly build from master. Without it you will install the latest patched release of MXNet:</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>$ pip install mxnet-mkl [--pre]
</code></pre></div></div>
<p>Or build MXNet from source code with <code class="highlighter-rouge">USE_ONEDNN=1</code>. For Linux users, <code class="highlighter-rouge">USE_ONEDNN=1</code> will be turned on by default.</p>
<p>We also find that setting the following environment variables can help:</p>
<table>
<thead>
<tr>
<th style="text-align: left">Variable</th>
<th style="text-align: left">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align: left"><code class="highlighter-rouge">OMP_NUM_THREADS</code></td>
<td style="text-align: left">Suggested value: <code class="highlighter-rouge">vCPUs / 2</code> in which <code class="highlighter-rouge">vCPUs</code> is the number of virtual CPUs. For more information, please see the guide for <a href="https://software.intel.com/en-us/mkl-windows-developer-guide-setting-the-number-of-threads-using-an-openmp-environment-variable">setting the number of threads using an OpenMP environment variable</a></td>
</tr>
<tr>
<td style="text-align: left"><code class="highlighter-rouge">KMP_AFFINITY</code></td>
<td style="text-align: left">Suggested value: <code class="highlighter-rouge">granularity=fine,compact,1,0</code>. For more information, please see the guide for <a href="https://software.intel.com/en-us/node/522691">Thread Affinity Interface (Linux* and Windows*)</a>.</td>
</tr>
</tbody>
</table>
<p>Note that <em>MXNet</em> treats all CPUs on a single machine as a single device.
So whether you specify <code class="highlighter-rouge">cpu(0)</code> or <code class="highlighter-rouge">cpu()</code>, <em>MXNet</em> will use all CPU cores on the machine.</p>
<h3 id="scoring-results">Scoring results</h3>
<p>The following tables show the performance of MXNet-1.2.0.rc1,
namely the number of images that can be predicted per second.
We used <a href="https://github.com/apache/mxnet/blob/master/example/image-classification/benchmark_score.py">example/image-classification/benchmark_score.py</a>
to measure the performance on different AWS EC2 machines.</p>
<p>AWS EC2 C5.18xlarge:</p>
<table>
<thead>
<tr>
<th>Batch</th>
<th>Alexnet</th>
<th>VGG 16</th>
<th>Inception-BN</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
<th>Resnet 152</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>390.53</td>
<td>81.57</td>
<td>124.13</td>
<td>62.26</td>
<td>76.22</td>
<td>32.92</td>
</tr>
<tr>
<td>2</td>
<td>596.45</td>
<td>100.84</td>
<td>206.58</td>
<td>93.36</td>
<td>119.55</td>
<td>46.80</td>
</tr>
<tr>
<td>4</td>
<td>710.77</td>
<td>119.04</td>
<td>275.55</td>
<td>127.86</td>
<td>148.62</td>
<td>59.36</td>
</tr>
<tr>
<td>8</td>
<td>921.40</td>
<td>120.38</td>
<td>380.82</td>
<td>157.11</td>
<td>167.95</td>
<td>70.78</td>
</tr>
<tr>
<td>16</td>
<td>1018.43</td>
<td>115.30</td>
<td>411.67</td>
<td>168.71</td>
<td>178.54</td>
<td>75.13</td>
</tr>
<tr>
<td>32</td>
<td>1290.31</td>
<td>107.19</td>
<td>483.34</td>
<td>179.38</td>
<td>193.47</td>
<td>85.86</td>
</tr>
</tbody>
</table>
<p>AWS EC2 C5.9xlarge:</p>
<table>
<thead>
<tr>
<th>Batch</th>
<th>Alexnet</th>
<th>VGG 16</th>
<th>Inception-BN</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
<th>Resnet 152</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>257.77</td>
<td>50.61</td>
<td>130.99</td>
<td>66.95</td>
<td>75.38</td>
<td>32.33</td>
</tr>
<tr>
<td>2</td>
<td>410.60</td>
<td>63.02</td>
<td>195.14</td>
<td>87.84</td>
<td>102.67</td>
<td>41.57</td>
</tr>
<tr>
<td>4</td>
<td>462.59</td>
<td>62.64</td>
<td>263.15</td>
<td>109.87</td>
<td>127.15</td>
<td>50.69</td>
</tr>
<tr>
<td>8</td>
<td>573.79</td>
<td>63.95</td>
<td>309.99</td>
<td>121.36</td>
<td>140.84</td>
<td>59.01</td>
</tr>
<tr>
<td>16</td>
<td>709.47</td>
<td>67.79</td>
<td>350.19</td>
<td>128.26</td>
<td>147.41</td>
<td>64.15</td>
</tr>
<tr>
<td>32</td>
<td>831.46</td>
<td>69.58</td>
<td>354.91</td>
<td>129.92</td>
<td>149.18</td>
<td>64.25</td>
</tr>
</tbody>
</table>
<p>AWS EC2 C5.4xlarge:</p>
<table>
<thead>
<tr>
<th>Batch</th>
<th>Alexnet</th>
<th>VGG 16</th>
<th>Inception-BN</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
<th>Resnet 152</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>214.15</td>
<td>29.32</td>
<td>114.97</td>
<td>47.96</td>
<td>61.01</td>
<td>23.92</td>
</tr>
<tr>
<td>2</td>
<td>310.04</td>
<td>34.81</td>
<td>150.09</td>
<td>60.89</td>
<td>71.16</td>
<td>27.92</td>
</tr>
<tr>
<td>4</td>
<td>330.69</td>
<td>34.56</td>
<td>186.63</td>
<td>74.15</td>
<td>86.86</td>
<td>34.37</td>
</tr>
<tr>
<td>8</td>
<td>378.88</td>
<td>35.46</td>
<td>204.89</td>
<td>77.05</td>
<td>91.10</td>
<td>36.93</td>
</tr>
<tr>
<td>16</td>
<td>424.00</td>
<td>36.49</td>
<td>211.55</td>
<td>78.39</td>
<td>91.23</td>
<td>37.34</td>
</tr>
<tr>
<td>32</td>
<td>481.95</td>
<td>37.23</td>
<td>213.71</td>
<td>78.23</td>
<td>91.68</td>
<td>37.26</td>
</tr>
</tbody>
</table>
<p>AWS EC2 C5.2xlarge:</p>
<table>
<thead>
<tr>
<th>Batch</th>
<th>Alexnet</th>
<th>VGG 16</th>
<th>Inception-BN</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
<th>Resnet 152</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>131.01</td>
<td>15.67</td>
<td>78.75</td>
<td>31.12</td>
<td>37.30</td>
<td>14.75</td>
</tr>
<tr>
<td>2</td>
<td>182.29</td>
<td>18.01</td>
<td>98.59</td>
<td>39.13</td>
<td>45.98</td>
<td>17.84</td>
</tr>
<tr>
<td>4</td>
<td>189.31</td>
<td>18.25</td>
<td>110.26</td>
<td>41.35</td>
<td>49.21</td>
<td>19.32</td>
</tr>
<tr>
<td>8</td>
<td>211.75</td>
<td>18.57</td>
<td>115.46</td>
<td>42.53</td>
<td>49.98</td>
<td>19.81</td>
</tr>
<tr>
<td>16</td>
<td>236.06</td>
<td>19.11</td>
<td>117.18</td>
<td>42.59</td>
<td>50.20</td>
<td>19.92</td>
</tr>
<tr>
<td>32</td>
<td>261.13</td>
<td>19.46</td>
<td>116.20</td>
<td>42.72</td>
<td>49.95</td>
<td>19.80</td>
</tr>
</tbody>
</table>
<p>AWS EC2 C5.xlarge:</p>
<table>
<thead>
<tr>
<th>Batch</th>
<th>Alexnet</th>
<th>VGG 16</th>
<th>Inception-BN</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
<th>Resnet 152</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>36.64</td>
<td>3.93</td>
<td>27.06</td>
<td>10.09</td>
<td>12.98</td>
<td>5.06</td>
</tr>
<tr>
<td>2</td>
<td>49.21</td>
<td>4.49</td>
<td>29.67</td>
<td>10.80</td>
<td>12.94</td>
<td>5.14</td>
</tr>
<tr>
<td>4</td>
<td>50.12</td>
<td>4.50</td>
<td>30.31</td>
<td>10.83</td>
<td>13.17</td>
<td>5.19</td>
</tr>
<tr>
<td>8</td>
<td>54.71</td>
<td>4.58</td>
<td>30.22</td>
<td>10.89</td>
<td>13.19</td>
<td>5.20</td>
</tr>
<tr>
<td>16</td>
<td>60.23</td>
<td>4.70</td>
<td>30.20</td>
<td>10.91</td>
<td>13.23</td>
<td>5.19</td>
</tr>
<tr>
<td>32</td>
<td>66.37</td>
<td>4.76</td>
<td>30.10</td>
<td>10.90</td>
<td>13.22</td>
<td>5.15</td>
</tr>
</tbody>
</table>
<h2 id="other-cpu">Other CPU</h2>
<p>On CPUs in general (not just Intel CPUs – ARM CPUs as well), NNPACK can improve running performance by 2x–7x. Please check <a href="nnpack">nnpack.md</a> for details.</p>
<h2 id="nvidia-gpu">Nvidia GPU</h2>
<p><code class="highlighter-rouge">cuDNN</code> typically accelerates <em>MXNet</em> performance on NVIDIA GPUs significantly,
especially for convolution layers.
We suggest always checking to make sure that a recent cuDNN version is used.</p>
<p>Setting the environment <code class="highlighter-rouge">export MXNET_CUDNN_AUTOTUNE_DEFAULT=1</code> sometimes also helps.</p>
<p>We show results when using various GPUs including K80 (EC2 p2.2xlarge), M60 (EC2 g3.4xlarge),
and V100 (EC2 p3.2xlarge).</p>
<h3 id="scoring-results-1">Scoring results</h3>
<p>Based on
<a href="https://github.com/apache/mxnet/blob/master/example/image-classification/benchmark_score.py">example/image-classification/benchmark_score.py</a>
and MXNet-1.2.0.rc1, with cuDNN 7.0.5</p>
<ul>
<li>K80 (single GPU)</li>
</ul>
<table>
<thead>
<tr>
<th>Batch</th>
<th>Alexnet</th>
<th>VGG 16</th>
<th>Inception-BN</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
<th>Resnet 152</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>243.93</td>
<td>43.59</td>
<td>68.62</td>
<td>35.52</td>
<td>67.41</td>
<td>23.65</td>
</tr>
<tr>
<td>2</td>
<td>338.16</td>
<td>49.14</td>
<td>113.41</td>
<td>56.29</td>
<td>93.35</td>
<td>33.88</td>
</tr>
<tr>
<td>4</td>
<td>478.92</td>
<td>53.44</td>
<td>159.61</td>
<td>74.43</td>
<td>119.18</td>
<td>45.23</td>
</tr>
<tr>
<td>8</td>
<td>683.52</td>
<td>70.50</td>
<td>190.49</td>
<td>86.23</td>
<td>131.32</td>
<td>50.54</td>
</tr>
<tr>
<td>16</td>
<td>1004.66</td>
<td>109.01</td>
<td>254.20</td>
<td>105.70</td>
<td>155.40</td>
<td>62.55</td>
</tr>
<tr>
<td>32</td>
<td>1238.55</td>
<td>114.98</td>
<td>285.49</td>
<td>116.79</td>
<td>159.42</td>
<td>64.99</td>
</tr>
<tr>
<td>64</td>
<td>1346.72</td>
<td>123.56</td>
<td>308.73</td>
<td>122.21</td>
<td>167.58</td>
<td>70.21</td>
</tr>
<tr>
<td>128</td>
<td>1416.91</td>
<td>OOM</td>
<td>320.98</td>
<td>123.11</td>
<td>171.55</td>
<td>71.85</td>
</tr>
<tr>
<td>256</td>
<td>1462.97</td>
<td>OOM</td>
<td>329.16</td>
<td>127.53</td>
<td>153.01</td>
<td>57.23</td>
</tr>
</tbody>
</table>
<ul>
<li>M60</li>
</ul>
<table>
<thead>
<tr>
<th>Batch</th>
<th>Alexnet</th>
<th>VGG 16</th>
<th>Inception-BN</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
<th>Resnet 152</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>243.49</td>
<td>59.95</td>
<td>101.97</td>
<td>48.30</td>
<td>95.46</td>
<td>39.29</td>
</tr>
<tr>
<td>2</td>
<td>491.04</td>
<td>69.14</td>
<td>170.35</td>
<td>80.27</td>
<td>142.61</td>
<td>60.17</td>
</tr>
<tr>
<td>4</td>
<td>711.54</td>
<td>78.94</td>
<td>257.89</td>
<td>123.09</td>
<td>182.36</td>
<td>76.51</td>
</tr>
<tr>
<td>8</td>
<td>1077.73</td>
<td>109.34</td>
<td>343.42</td>
<td>152.82</td>
<td>208.74</td>
<td>87.27</td>
</tr>
<tr>
<td>16</td>
<td>1447.21</td>
<td>144.93</td>
<td>390.25</td>
<td>166.32</td>
<td>220.73</td>
<td>92.41</td>
</tr>
<tr>
<td>32</td>
<td>1797.66</td>
<td>151.86</td>
<td>416.69</td>
<td>176.56</td>
<td>230.19</td>
<td>97.03</td>
</tr>
<tr>
<td>64</td>
<td>1779.38</td>
<td>150.18</td>
<td>427.51</td>
<td>183.47</td>
<td>239.12</td>
<td>101.59</td>
</tr>
<tr>
<td>128</td>
<td>1787.36</td>
<td>OOM</td>
<td>439.04</td>
<td>185.29</td>
<td>243.31</td>
<td>103.39</td>
</tr>
<tr>
<td>256</td>
<td>1899.10</td>
<td>OOM</td>
<td>450.22</td>
<td>183.42</td>
<td>242.36</td>
<td>100.98</td>
</tr>
</tbody>
</table>
<ul>
<li>V100</li>
</ul>
<table>
<thead>
<tr>
<th>Batch</th>
<th>Alexnet</th>
<th>VGG 16</th>
<th>Inception-BN</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
<th>Resnet 152</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>659.51</td>
<td>205.16</td>
<td>157.37</td>
<td>87.71</td>
<td>162.15</td>
<td>61.38</td>
</tr>
<tr>
<td>2</td>
<td>1248.21</td>
<td>265.40</td>
<td>297.34</td>
<td>159.24</td>
<td>293.74</td>
<td>116.30</td>
</tr>
<tr>
<td>4</td>
<td>2122.41</td>
<td>333.97</td>
<td>520.91</td>
<td>279.84</td>
<td>479.14</td>
<td>195.17</td>
</tr>
<tr>
<td>8</td>
<td>3894.30</td>
<td>420.26</td>
<td>898.09</td>
<td>455.03</td>
<td>699.39</td>
<td>294.19</td>
</tr>
<tr>
<td>16</td>
<td>5815.58</td>
<td>654.16</td>
<td>1430.97</td>
<td>672.54</td>
<td>947.45</td>
<td>398.79</td>
</tr>
<tr>
<td>32</td>
<td>7906.09</td>
<td>708.43</td>
<td>1847.26</td>
<td>814.59</td>
<td>1076.81</td>
<td>451.82</td>
</tr>
<tr>
<td>64</td>
<td>9486.26</td>
<td>701.59</td>
<td>2134.89</td>
<td>899.01</td>
<td>1168.37</td>
<td>480.44</td>
</tr>
<tr>
<td>128</td>
<td>10177.84</td>
<td>703.30</td>
<td>2318.32</td>
<td>904.33</td>
<td>1233.15</td>
<td>511.79</td>
</tr>
<tr>
<td>256</td>
<td>10990.46</td>
<td>473.62</td>
<td>2425.28</td>
<td>960.20</td>
<td>1155.07</td>
<td>449.35</td>
</tr>
</tbody>
</table>
<p>Below are the performance results on V100 using float16.</p>
<table>
<thead>
<tr>
<th>Batch</th>
<th>VGG 16</th>
<th>Inception-BN</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
<th>Resnet 152</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>276.29</td>
<td>155.53</td>
<td>150.99</td>
<td>270.89</td>
<td>96.79</td>
</tr>
<tr>
<td>2</td>
<td>476.91</td>
<td>296.45</td>
<td>282.02</td>
<td>493.99</td>
<td>176.88</td>
</tr>
<tr>
<td>4</td>
<td>711.92</td>
<td>525.05</td>
<td>492.45</td>
<td>851.15</td>
<td>321.52</td>
</tr>
<tr>
<td>8</td>
<td>1047.11</td>
<td>900.26</td>
<td>807.94</td>
<td>1282.36</td>
<td>517.66</td>
</tr>
<tr>
<td>16</td>
<td>1299.88</td>
<td>1441.41</td>
<td>1192.21</td>
<td>1722.97</td>
<td>724.57</td>
</tr>
<tr>
<td>32</td>
<td>1486.63</td>
<td>1854.30</td>
<td>1512.08</td>
<td>2085.51</td>
<td>887.34</td>
</tr>
<tr>
<td>64</td>
<td>1219.65</td>
<td>2138.61</td>
<td>1687.35</td>
<td>2341.67</td>
<td>1002.90</td>
</tr>
<tr>
<td>128</td>
<td>1169.81</td>
<td>2317.39</td>
<td>1818.26</td>
<td>2355.04</td>
<td>1046.98</td>
</tr>
<tr>
<td>256</td>
<td>764.16</td>
<td>2425.16</td>
<td>1653.74</td>
<td>1991.88</td>
<td>976.73</td>
</tr>
</tbody>
</table>
<h3 id="training-results">Training results</h3>
<p>Based on
<a href="https://github.com/apache/mxnet/blob/master/example/image-classification/train_imagenet.py">example/image-classification/train_imagenet.py</a>
and MXNet-1.2.0.rc1, with cuDNN 7.0.5. The benchmark script is available
<a href="https://github.com/mli/mxnet-benchmark/blob/master/run_vary_batch.sh">here</a>,
where the batch size for Alexnet is increased by 16x.</p>
<ul>
<li>K80 (single GPU)</li>
</ul>
<table>
<thead>
<tr>
<th>Batch</th>
<th>Alexnet(*16)</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>300.30</td>
<td>10.48</td>
<td>15.61</td>
</tr>
<tr>
<td>2</td>
<td>406.08</td>
<td>16.00</td>
<td>23.88</td>
</tr>
<tr>
<td>4</td>
<td>461.01</td>
<td>22.10</td>
<td>32.26</td>
</tr>
<tr>
<td>8</td>
<td>484.00</td>
<td>26.80</td>
<td>39.42</td>
</tr>
<tr>
<td>16</td>
<td>490.45</td>
<td>31.62</td>
<td>46.69</td>
</tr>
<tr>
<td>32</td>
<td>414.72</td>
<td>33.78</td>
<td>49.48</td>
</tr>
</tbody>
</table>
<ul>
<li>M60</li>
</ul>
<table>
<thead>
<tr>
<th>Batch</th>
<th>Alexnet(*16)</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>380.96</td>
<td>14.06</td>
<td>20.55</td>
</tr>
<tr>
<td>2</td>
<td>530.53</td>
<td>21.90</td>
<td>32.65</td>
</tr>
<tr>
<td>4</td>
<td>600.17</td>
<td>31.96</td>
<td>45.57</td>
</tr>
<tr>
<td>8</td>
<td>633.60</td>
<td>40.58</td>
<td>54.92</td>
</tr>
<tr>
<td>16</td>
<td>639.37</td>
<td>46.88</td>
<td>64.44</td>
</tr>
<tr>
<td>32</td>
<td>576.54</td>
<td>50.05</td>
<td>68.34</td>
</tr>
</tbody>
</table>
<ul>
<li>V100</li>
</ul>
<table>
<thead>
<tr>
<th>Batch</th>
<th>Alexnet(*16)</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>1629.52</td>
<td>21.83</td>
<td>34.54</td>
</tr>
<tr>
<td>2</td>
<td>2359.73</td>
<td>40.11</td>
<td>65.01</td>
</tr>
<tr>
<td>4</td>
<td>2687.89</td>
<td>72.79</td>
<td>113.49</td>
</tr>
<tr>
<td>8</td>
<td>2919.02</td>
<td>118.43</td>
<td>174.81</td>
</tr>
<tr>
<td>16</td>
<td>2994.32</td>
<td>173.15</td>
<td>251.22</td>
</tr>
<tr>
<td>32</td>
<td>2585.61</td>
<td>214.48</td>
<td>298.51</td>
</tr>
<tr>
<td>64</td>
<td>1984.21</td>
<td>247.43</td>
<td>343.19</td>
</tr>
<tr>
<td>128</td>
<td>OOM</td>
<td>253.68</td>
<td>363.69</td>
</tr>
</tbody>
</table>
<h2 id="multiple-devices">Multiple Devices</h2>
<p>If more than one GPU or machine is used, MXNet uses <code class="highlighter-rouge">kvstore</code> to communicate data.
It’s critical to use the proper type of <code class="highlighter-rouge">kvstore</code> to get the best performance.
Refer to <a href="https://mxnet.apache.org/api/faq/distributed_training.html">Distributed Training</a> for more
details.</p>
<p>Besides, we can use <a href="https://github.com/apache/mxnet/tree/master/tools/bandwidth">tools/bandwidth</a>
to find the communication cost per batch.
Ideally, the communication cost should be less than the time to compute a batch.
To reduce the communication cost, we can consider:</p>
<ul>
<li>Exploring different <code class="highlighter-rouge">--kv-store</code> options.</li>
<li>Increasing the batch size to improve the computation to communication ratio.</li>
</ul>
<p>Finally, MXNet is integrated with other distributed training frameworks, including <a href="https://github.com/apache/mxnet/tree/master/example/distributed_training-horovod">horovod</a> and <a href="https://github.com/bytedance/byteps#use-byteps-in-your-code">BytePS</a>.</p>
<h2 id="input-data">Input Data</h2>
<p>To make sure you’re handling input data in a reasonable way consider the following:</p>
<ul>
<li>Data format: If you are using the <code class="highlighter-rouge">rec</code> format, then everything should be fine.</li>
<li>Decoding: By default, <em>MXNet</em> uses 4 CPU threads for decoding images.
This is often sufficient to decode more than 1K images per second.
If you are using a low-end CPU or your GPUs are very powerful, you can increase the number of threads.</li>
<li>Storage location: Any local or distributed file system (HDFS, Amazon S3) should be fine.
If multiple devices read the data from a shared network file system (NFS) at the same time, problems might occur.</li>
<li>Use a large batch size. We often choose the largest one that fits into GPU memory.
However, a value that’s too large can slow down convergence.
For example, the safe batch size for CIFAR 10 is approximately 200, while for ImageNet 1K, the batch size can exceed 1K.</li>
</ul>
<h2 id="profiler">Profiler</h2>
<p><em>MXNet</em> has a built-in profiler
that gives detailed information about execution time at the operator level.
This feature complements general profiling tools like <em>nvprof</em> and <em>gprof</em>
by summarizing at the operator level, instead of at the function, kernel, or instruction level.</p>
<p>The profiler can be turned on with an <a href="/versions/master/api/faq/env_var#control-the-profiler">environment variable</a>
for an entire program run, or programmatically for just part of a run. Note that by default the profiler hides the details of each individual operator, and you can reveal the details by setting environment variables <code class="highlighter-rouge">MXNET_EXEC_BULK_EXEC_INFERENCE</code>, <code class="highlighter-rouge">MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN</code> and <code class="highlighter-rouge">MXNET_EXEC_BULK_EXEC_TRAIN</code> to 0.
See <a href="https://github.com/apache/mxnet/tree/master/example/profiler">example/profiler</a>
for complete examples of how to use the profiler in code, or <a href="https://mxnet.apache.org/api/python/docs/tutorials/performance/backend/profiler.html">this tutorial</a> on how to profile MXNet performance.</p>
<p>Briefly, the Python code looks like:</p>
<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="c1"># wait for previous operations to complete
</span><span class="n">mx</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">waitall</span><span class="p">()</span>
<span class="n">mx</span><span class="o">.</span><span class="n">profiler</span><span class="o">.</span><span class="n">set_config</span><span class="p">(</span><span class="n">profile_all</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">aggregate_stats</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">filename</span><span class="o">=</span><span class="s">'profile_output.json'</span><span class="p">)</span>
<span class="n">mx</span><span class="o">.</span><span class="n">profiler</span><span class="o">.</span><span class="n">set_state</span><span class="p">(</span><span class="s">'run'</span><span class="p">)</span>
<span class="c1"># Code to be profiled goes here...
</span>
<span class="c1"># wait for previous operations to complete
</span><span class="n">mx</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">waitall</span><span class="p">()</span>
<span class="n">mx</span><span class="o">.</span><span class="n">profiler</span><span class="o">.</span><span class="n">set_state</span><span class="p">(</span><span class="s">'stop'</span><span class="p">)</span>
</code></pre></div></div>
<p>After the program finishes, navigate to your browser’s tracing page (for example, chrome://tracing in a Chrome browser) and load the <code class="highlighter-rouge">profile_output.json</code> file output by the profiler to inspect the results.</p>
<p><img src="https://cloud.githubusercontent.com/assets/17693755/18035938/0a43484a-6d93-11e6-80d4-241c6ca552ea.png" alt="MLP Profile" /></p>
<p>Note that the output file can grow extremely large, so this approach is not recommended for general use.</p>
</div>
</div>
</div>
</div>
</article>
</main><footer class="site-footer h-card">
<div class="wrapper">
<div class="row">
<div class="col-4">
<h4 class="footer-category-title">Resources</h4>
<ul class="contact-list">
<li><a href="/versions/master/community#stay-connected">Mailing lists</a></li>
<li><a href="/versions/master/community#github-issues">GitHub Issues</a></li>
<li><a href="https://github.com/apache/mxnet/projects">Projects</a></li>
<li><a href="https://cwiki.apache.org/confluence/display/MXNET/Apache+MXNet+Home">Developer Wiki</a></li>
<li><a href="https://discuss.mxnet.io">Forum</a></li>
<li><a href="/versions/master/community">Contribute To MXNet</a></li>
</ul>
</div>
<div class="col-4"><ul class="social-media-list"><li><a href="https://github.com/apache/mxnet"><svg class="svg-icon"><use xlink:href="/versions/master/assets/minima-social-icons.svg#github"></use></svg> <span class="username">apache/mxnet</span></a></li><li><a href="https://www.twitter.com/apachemxnet"><svg class="svg-icon"><use xlink:href="/versions/master/assets/minima-social-icons.svg#twitter"></use></svg> <span class="username">apachemxnet</span></a></li><li><a href="https://youtube.com/apachemxnet"><svg class="svg-icon"><use xlink:href="/versions/master/assets/minima-social-icons.svg#youtube"></use></svg> <span class="username">apachemxnet</span></a></li></ul>
</div>
<div class="col-4 footer-text">
<p>A flexible and efficient library for deep learning.</p>
</div>
</div>
</div>
</footer>
<footer class="site-footer2">
<div class="wrapper">
<div class="row">
<div class="col-3">
<img src="/versions/master/assets/img/asf_logo.svg" class="footer-logo col-2" alt="Apache Software Foundation logo">
</div>
<div class="footer-bottom-warning col-9">
<p>"Copyright © 2017-2022, The Apache Software Foundation. Licensed under the Apache License, Version 2.0. Apache MXNet, MXNet, Apache, the Apache
feather, and the Apache MXNet project logo are either registered trademarks or trademarks of the
Apache Software Foundation."</p>
</div>
</div>
</div>
</footer>
</body>
</html>