<!DOCTYPE html>
<!---
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<html lang="en"><head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link href="/versions/1.9.1/assets/img/mxnet-icon.png" rel="icon" type="image/png"><!-- Begin Jekyll SEO tag v2.6.1 -->
<title>Some Tips for Improving MXNet Performance | Apache MXNet</title>
<meta name="generator" content="Jekyll v3.8.6" />
<meta property="og:title" content="Some Tips for Improving MXNet Performance" />
<meta property="og:locale" content="en_US" />
<meta name="description" content="A flexible and efficient library for deep learning." />
<meta property="og:description" content="A flexible and efficient library for deep learning." />
<link rel="canonical" href="https://mxnet.apache.org/versions/1.9.1/api/faq/perf" />
<meta property="og:url" content="https://mxnet.apache.org/versions/1.9.1/api/faq/perf" />
<meta property="og:site_name" content="Apache MXNet" />
<script type="application/ld+json">
{"description":"A flexible and efficient library for deep learning.","headline":"Some Tips for Improving MXNet Performance","@type":"WebPage","url":"https://mxnet.apache.org/versions/1.9.1/api/faq/perf","@context":"https://schema.org"}</script>
<!-- End Jekyll SEO tag -->
<link rel="stylesheet" href="/versions/1.9.1/assets/docsearch.min.css" /><link rel="stylesheet" href="/versions/1.9.1/assets/main.css"><link type="application/atom+xml" rel="alternate" href="https://mxnet.apache.org/versions/1.9.1/feed.xml" title="Apache MXNet" /><!-- Matomo -->
<script>
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
/* We explicitly disable cookie tracking to avoid privacy issues */
_paq.push(['disableCookies']);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '23']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
<script src="/versions/1.9.1/assets/js/jquery-3.3.1.min.js"></script>
<script src="/versions/1.9.1/assets/js/docsearch.min.js"></script><script src="/versions/1.9.1/assets/js/globalSearch.js" defer></script>
<script src="/versions/1.9.1/assets/js/clipboard.js" defer></script>
<script src="/versions/1.9.1/assets/js/copycode.js" defer></script></head>
<body><header class="site-header" role="banner">
<script>
$(document).ready(function () {
// HEADER OPACITY LOGIC
function opacity_header() {
var value = "rgba(4,140,204," + ($(window).scrollTop() / 300 + 0.4) + ")"
$('.site-header').css("background-color", value)
}
$(window).scroll(function () {
opacity_header()
})
opacity_header();
// MENU SELECTOR LOGIC
$('.page-link').each( function () {
if (window.location.href.includes(this.href)) {
$(this).addClass("page-current");
}
});
})
</script>
<div class="wrapper">
<a class="site-title" rel="author" href="/versions/1.9.1/"><img
src="/versions/1.9.1/assets/img/mxnet_logo.png" class="site-header-logo"></a>
<nav class="site-nav">
<input type="checkbox" id="nav-trigger" class="nav-trigger"/>
<label for="nav-trigger">
<span class="menu-icon">
<svg viewBox="0 0 18 15" width="18px" height="15px">
<path d="M18,1.484c0,0.82-0.665,1.484-1.484,1.484H1.484C0.665,2.969,0,2.304,0,1.484l0,0C0,0.665,0.665,0,1.484,0 h15.032C17.335,0,18,0.665,18,1.484L18,1.484z M18,7.516C18,8.335,17.335,9,16.516,9H1.484C0.665,9,0,8.335,0,7.516l0,0 c0-0.82,0.665-1.484,1.484-1.484h15.032C17.335,6.031,18,6.696,18,7.516L18,7.516z M18,13.516C18,14.335,17.335,15,16.516,15H1.484 C0.665,15,0,14.335,0,13.516l0,0c0-0.82,0.665-1.483,1.484-1.483h15.032C17.335,12.031,18,12.695,18,13.516L18,13.516z"/>
</svg>
</span>
</label>
<div class="gs-search-border">
<div id="gs-search-icon"></div>
<form id="global-search-form">
<input id="global-search" type="text" title="Search" placeholder="Search" />
<div id="global-search-dropdown-container">
<button class="gs-current-version btn" type="button" data-toggle="dropdown">
<span id="gs-current-version-label">1.9.1</span>
<svg class="gs-dropdown-caret" viewBox="0 0 32 32" class="icon icon-caret-bottom" aria-hidden="true">
<path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path>
</svg>
</button>
<ul class="gs-opt-group gs-version-dropdown">
<li class="gs-opt gs-versions">master</li>
<li class="gs-opt gs-versions active">1.9.1</li>
<li class="gs-opt gs-versions">1.8.0</li>
<li class="gs-opt gs-versions">1.7.0</li>
<li class="gs-opt gs-versions">1.6.0</li>
<li class="gs-opt gs-versions">1.5.0</li>
<li class="gs-opt gs-versions">1.4.1</li>
<li class="gs-opt gs-versions">1.3.1</li>
<li class="gs-opt gs-versions">1.2.1</li>
<li class="gs-opt gs-versions">1.1.0</li>
<li class="gs-opt gs-versions">1.0.0</li>
<li class="gs-opt gs-versions">0.12.1</li>
<li class="gs-opt gs-versions">0.11.0</li>
</ul>
</div>
<span id="global-search-close">x</span>
</form>
</div>
<div class="trigger">
<div id="global-search-mobile-border">
<div id="gs-search-icon-mobile"></div>
<input id="global-search-mobile" placeholder="Search..." type="text"/>
<div id="global-search-dropdown-container-mobile">
<button class="gs-current-version-mobile btn" type="button" data-toggle="dropdown">
<svg class="gs-dropdown-caret" viewBox="0 0 32 32" class="icon icon-caret-bottom" aria-hidden="true">
<path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path>
</svg>
</button>
<ul class="gs-opt-group gs-version-dropdown-mobile">
<li class="gs-opt gs-versions">master</li>
<li class="gs-opt gs-versions active">1.9.1</li>
<li class="gs-opt gs-versions">1.8.0</li>
<li class="gs-opt gs-versions">1.7.0</li>
<li class="gs-opt gs-versions">1.6.0</li>
<li class="gs-opt gs-versions">1.5.0</li>
<li class="gs-opt gs-versions">1.4.1</li>
<li class="gs-opt gs-versions">1.3.1</li>
<li class="gs-opt gs-versions">1.2.1</li>
<li class="gs-opt gs-versions">1.1.0</li>
<li class="gs-opt gs-versions">1.0.0</li>
<li class="gs-opt gs-versions">0.12.1</li>
<li class="gs-opt gs-versions">0.11.0</li>
</ul>
</div>
</div>
<a class="page-link" href="/versions/1.9.1/get_started">Get Started</a>
<a class="page-link" href="/versions/1.9.1/features">Features</a>
<a class="page-link" href="/versions/1.9.1/ecosystem">Ecosystem</a>
<a class="page-link" href="/versions/1.9.1/api">Docs & Tutorials</a>
<a class="page-link" href="/versions/1.9.1/trusted_by">Trusted By</a>
<a class="page-link" href="https://github.com/apache/mxnet">GitHub</a>
<div class="dropdown" style="min-width:100px">
<span class="dropdown-header">Apache
<svg class="dropdown-caret" viewBox="0 0 32 32" class="icon icon-caret-bottom" aria-hidden="true"><path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path></svg>
</span>
<div class="dropdown-content" style="min-width:250px">
<a href="https://www.apache.org/foundation/">Apache Software Foundation</a>
<a href="https://www.apache.org/licenses/">License</a>
<a href="/versions/1.9.1/api/faq/security.html">Security</a>
<a href="https://privacy.apache.org/policies/privacy-policy-public.html">Privacy</a>
<a href="https://www.apache.org/events/current-event">Events</a>
<a href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a>
<a href="https://www.apache.org/foundation/thanks.html">Thanks</a>
</div>
</div>
<div class="dropdown">
<span class="dropdown-header">1.9.1
<svg class="dropdown-caret" viewBox="0 0 32 32" class="icon icon-caret-bottom" aria-hidden="true"><path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path></svg>
</span>
<div class="dropdown-content">
<a href="/">master</a>
<a class="dropdown-option-active" href="/versions/1.9.1/">1.9.1</a>
<a href="/versions/1.8.0/">1.8.0</a>
<a href="/versions/1.7.0/">1.7.0</a>
<a href="/versions/1.6.0/">1.6.0</a>
<a href="/versions/1.5.0/">1.5.0</a>
<a href="/versions/1.4.1/">1.4.1</a>
<a href="/versions/1.3.1/">1.3.1</a>
<a href="/versions/1.2.1/">1.2.1</a>
<a href="/versions/1.1.0/">1.1.0</a>
<a href="/versions/1.0.0/">1.0.0</a>
<a href="/versions/0.12.1/">0.12.1</a>
<a href="/versions/0.11.0/">0.11.0</a>
</div>
</div>
</div>
</nav>
</div>
</header>
<main class="page-content" aria-label="Content">
<script>
</script>
<article class="post">
<header class="post-header wrapper">
<h1 class="post-title">Some Tips for Improving MXNet Performance</h1>
<h3></h3></header>
<div class="post-content">
<div class="wrapper">
<div class="row">
<div class="col-3 docs-side-bar">
<h3 style="text-transform: capitalize; padding-left:10px">faq</h3>
<ul>
<li><a href="/versions/1.9.1/api/faq/add_op_in_backend">A Beginner's Guide to Implementing Operators in MXNet Backend</a></li>
<li><a href="/versions/1.9.1/api/faq/caffe">Convert from Caffe to MXNet</a></li>
<li><a href="/versions/1.9.1/api/faq/cloud">MXNet on the Cloud</a></li>
<li><a href="/versions/1.9.1/api/faq/distributed_training">Distributed Training in MXNet</a></li>
<li><a href="/versions/1.9.1/api/faq/env_var">Environment Variables</a></li>
<li><a href="/versions/1.9.1/api/faq/float16">Float16</a></li>
<li><a href="/versions/1.9.1/api/faq/large_tensor_support">Using MXNet with Large Tensor Support</a></li>
<li><a href="/versions/1.9.1/api/faq/model_parallel_lstm">Model Parallel</a></li>
<li><a href="/versions/1.9.1/api/faq/multi_device">Data Parallelism with Multiple CPU/GPUs on MXNet</a></li>
<li><a href="/versions/1.9.1/api/faq/new_op">Create New Operators</a></li>
<li><a href="/versions/1.9.1/api/faq/nnpack">NNPACK for Multi-Core CPU Support in MXNet</a></li>
<li><a href="/versions/1.9.1/api/faq/perf">Some Tips for Improving MXNet Performance</a></li>
<li><a href="/versions/1.9.1/api/faq/recordio">Create a Dataset Using RecordIO</a></li>
<li><a href="/versions/1.9.1/api/faq/s3_integration">Use data from S3 for training</a></li>
<li><a href="/versions/1.9.1/api/faq/security">MXNet Security Best Practices</a></li>
<li><a href="/versions/1.9.1/api/faq/smart_device">Deep Learning at the Edge</a></li>
<li><a href="/versions/1.9.1/api/faq/visualize_graph">Visualize Neural Networks</a></li>
<li><a href="/versions/1.9.1/api/faq/why_mxnet">Why MXNet came to be?</a></li>
</ul>
</div>
<div class="col-9">
<!--- Licensed to the Apache Software Foundation (ASF) under one -->
<!--- or more contributor license agreements. See the NOTICE file -->
<!--- distributed with this work for additional information -->
<!--- regarding copyright ownership. The ASF licenses this file -->
<!--- to you under the Apache License, Version 2.0 (the -->
<!--- "License"); you may not use this file except in compliance -->
<!--- with the License. You may obtain a copy of the License at -->
<!--- http://www.apache.org/licenses/LICENSE-2.0 -->
<!--- Unless required by applicable law or agreed to in writing, -->
<!--- software distributed under the License is distributed on an -->
<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
<!--- KIND, either express or implied. See the License for the -->
<!--- specific language governing permissions and limitations -->
<!--- under the License. -->
<h1 id="some-tips-for-improving-mxnet-performance">Some Tips for Improving MXNet Performance</h1>
<p>Even after fixing the training or deployment environment and the parallelization scheme,
a number of configuration settings and data-handling choices can affect <em>MXNet</em> performance.
In this document, we share some tips for improving <em>MXNet</em> performance.</p>
<p>Performance is mainly affected by the following four factors:</p>
<ol>
<li>Implementation of operators (convolution, pooling, etc.)
<ul>
<li><a href="#intel-cpu">Intel CPU</a></li>
<li><a href="#nvidia-gpu">Nvidia GPU</a></li>
</ul></li>
<li>Input data loading and augmentation
<ul>
<li><a href="#input-data">Input Data</a></li>
</ul></li>
<li>Workload (computation graph) optimization and scheduling
<ul>
<li><a href="#profiler">Profiler</a></li>
</ul></li>
<li>Communication for multi-device training
<ul>
<li><a href="#multiple-devices">Multiple Devices</a></li>
</ul></li>
</ol>
<h2 id="intel-cpu">Intel CPU</h2>
<p>When using Intel Xeon CPUs for training and inference, the <code>mxnet-mkl</code> package is recommended. Adding <code>--pre</code> installs a nightly build from master; without it, you will install the latest patched release of MXNet:</p>
<div class="highlight"><pre><code class="language-" data-lang="">$ pip install mxnet-mkl [--pre]
</code></pre></div>
<p>Alternatively, build MXNet from source with <code>USE_MKLDNN=1</code>. For Linux users, <code>USE_MKLDNN=1</code> is turned on by default.</p>
<p>We also find that setting the following environment variables can help:</p>
<table><thead>
<tr>
<th style="text-align: left">Variable</th>
<th style="text-align: left">Description</th>
</tr>
</thead><tbody>
<tr>
<td style="text-align: left"><code>OMP_NUM_THREADS</code></td>
<td style="text-align: left">Suggested value: <code>vCPUs / 2</code> in which <code>vCPUs</code> is the number of virtual CPUs. For more information, please see the guide for <a href="https://software.intel.com/en-us/mkl-windows-developer-guide-setting-the-number-of-threads-using-an-openmp-environment-variable">setting the number of threads using an OpenMP environment variable</a></td>
</tr>
<tr>
<td style="text-align: left"><code>KMP_AFFINITY</code></td>
<td style="text-align: left">Suggested value: <code>granularity=fine,compact,1,0</code>. For more information, please see the guide for <a href="https://software.intel.com/en-us/node/522691">Thread Affinity Interface (Linux* and Windows*)</a>.</td>
</tr>
<tr>
<td style="text-align: left"><code>MXNET_SUBGRAPH_BACKEND</code></td>
<td style="text-align: left">Set to MKLDNN to enable the <a href="https://cwiki.apache.org/confluence/display/MXNET/MXNet+Graph+Optimization+and+Quantization+based+on+subgraph+and+MKL-DNN">subgraph feature</a> for better performance. For more information please see <a href="https://mxnet.apache.org/api/python/docs/tutorials/performance/backend/mkldnn/mkldnn_readme.html">Build/Install MXNet with MKL-DNN</a></td>
</tr>
</tbody></table>
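<p>As a rough illustration (the values below are placeholders; tune them for your own machine), these variables can also be set from Python before <code>mxnet</code> is imported, so that the OpenMP runtime and the MKL-DNN subgraph backend pick them up:</p>
<div class="highlight"><pre><code class="language-python" data-lang="python">import os
import multiprocessing

# Illustrative sketch: use roughly half of the virtual CPUs for OpenMP threads
# and set the variables before importing mxnet.
vcpus = multiprocessing.cpu_count()
os.environ['OMP_NUM_THREADS'] = str(max(vcpus // 2, 1))
os.environ['KMP_AFFINITY'] = 'granularity=fine,compact,1,0'
os.environ['MXNET_SUBGRAPH_BACKEND'] = 'MKLDNN'

import mxnet as mx
</code></pre></div>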
<p>Note that <em>MXNet</em> treats all CPUs on a single machine as a single device.
So whether you specify <code>cpu(0)</code> or <code>cpu()</code>, <em>MXNet</em> will use all CPU cores on the machine.</p>
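<p>For instance (a trivial sketch), both forms produce arrays on the same CPU context:</p>
<div class="highlight"><pre><code class="language-python" data-lang="python">import mxnet as mx

# cpu() and cpu(0) are interchangeable; both use all CPU cores on the machine.
a = mx.nd.ones((2, 3), ctx=mx.cpu())
b = mx.nd.ones((2, 3), ctx=mx.cpu(0))
print(a.context, b.context)  # cpu(0) cpu(0)
</code></pre></div>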
<h3 id="scoring-results">Scoring results</h3>
<p>The following tables show the performance of MXNet 1.2.0.rc1,
measured as the number of images that can be predicted per second.
We used <a href="https://github.com/apache/mxnet/blob/v1.x/example/image-classification/benchmark_score.py">example/image-classification/benchmark_score.py</a>
to measure the performance on different AWS EC2 machines.</p>
<p>AWS EC2 C5.18xlarge:</p>
<table><thead>
<tr>
<th>Batch</th>
<th>Alexnet</th>
<th>VGG 16</th>
<th>Inception-BN</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
<th>Resnet 152</th>
</tr>
</thead><tbody>
<tr>
<td>1</td>
<td>390.53</td>
<td>81.57</td>
<td>124.13</td>
<td>62.26</td>
<td>76.22</td>
<td>32.92</td>
</tr>
<tr>
<td>2</td>
<td>596.45</td>
<td>100.84</td>
<td>206.58</td>
<td>93.36</td>
<td>119.55</td>
<td>46.80</td>
</tr>
<tr>
<td>4</td>
<td>710.77</td>
<td>119.04</td>
<td>275.55</td>
<td>127.86</td>
<td>148.62</td>
<td>59.36</td>
</tr>
<tr>
<td>8</td>
<td>921.40</td>
<td>120.38</td>
<td>380.82</td>
<td>157.11</td>
<td>167.95</td>
<td>70.78</td>
</tr>
<tr>
<td>16</td>
<td>1018.43</td>
<td>115.30</td>
<td>411.67</td>
<td>168.71</td>
<td>178.54</td>
<td>75.13</td>
</tr>
<tr>
<td>32</td>
<td>1290.31</td>
<td>107.19</td>
<td>483.34</td>
<td>179.38</td>
<td>193.47</td>
<td>85.86</td>
</tr>
</tbody></table>
<p>AWS EC2 C5.9xlarge:</p>
<table><thead>
<tr>
<th>Batch</th>
<th>Alexnet</th>
<th>VGG 16</th>
<th>Inception-BN</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
<th>Resnet 152</th>
</tr>
</thead><tbody>
<tr>
<td>1</td>
<td>257.77</td>
<td>50.61</td>
<td>130.99</td>
<td>66.95</td>
<td>75.38</td>
<td>32.33</td>
</tr>
<tr>
<td>2</td>
<td>410.60</td>
<td>63.02</td>
<td>195.14</td>
<td>87.84</td>
<td>102.67</td>
<td>41.57</td>
</tr>
<tr>
<td>4</td>
<td>462.59</td>
<td>62.64</td>
<td>263.15</td>
<td>109.87</td>
<td>127.15</td>
<td>50.69</td>
</tr>
<tr>
<td>8</td>
<td>573.79</td>
<td>63.95</td>
<td>309.99</td>
<td>121.36</td>
<td>140.84</td>
<td>59.01</td>
</tr>
<tr>
<td>16</td>
<td>709.47</td>
<td>67.79</td>
<td>350.19</td>
<td>128.26</td>
<td>147.41</td>
<td>64.15</td>
</tr>
<tr>
<td>32</td>
<td>831.46</td>
<td>69.58</td>
<td>354.91</td>
<td>129.92</td>
<td>149.18</td>
<td>64.25</td>
</tr>
</tbody></table>
<p>AWS EC2 C5.4xlarge:</p>
<table><thead>
<tr>
<th>Batch</th>
<th>Alexnet</th>
<th>VGG 16</th>
<th>Inception-BN</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
<th>Resnet 152</th>
</tr>
</thead><tbody>
<tr>
<td>1</td>
<td>214.15</td>
<td>29.32</td>
<td>114.97</td>
<td>47.96</td>
<td>61.01</td>
<td>23.92</td>
</tr>
<tr>
<td>2</td>
<td>310.04</td>
<td>34.81</td>
<td>150.09</td>
<td>60.89</td>
<td>71.16</td>
<td>27.92</td>
</tr>
<tr>
<td>4</td>
<td>330.69</td>
<td>34.56</td>
<td>186.63</td>
<td>74.15</td>
<td>86.86</td>
<td>34.37</td>
</tr>
<tr>
<td>8</td>
<td>378.88</td>
<td>35.46</td>
<td>204.89</td>
<td>77.05</td>
<td>91.10</td>
<td>36.93</td>
</tr>
<tr>
<td>16</td>
<td>424.00</td>
<td>36.49</td>
<td>211.55</td>
<td>78.39</td>
<td>91.23</td>
<td>37.34</td>
</tr>
<tr>
<td>32</td>
<td>481.95</td>
<td>37.23</td>
<td>213.71</td>
<td>78.23</td>
<td>91.68</td>
<td>37.26</td>
</tr>
</tbody></table>
<p>AWS EC2 C5.2xlarge:</p>
<table><thead>
<tr>
<th>Batch</th>
<th>Alexnet</th>
<th>VGG 16</th>
<th>Inception-BN</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
<th>Resnet 152</th>
</tr>
</thead><tbody>
<tr>
<td>1</td>
<td>131.01</td>
<td>15.67</td>
<td>78.75</td>
<td>31.12</td>
<td>37.30</td>
<td>14.75</td>
</tr>
<tr>
<td>2</td>
<td>182.29</td>
<td>18.01</td>
<td>98.59</td>
<td>39.13</td>
<td>45.98</td>
<td>17.84</td>
</tr>
<tr>
<td>4</td>
<td>189.31</td>
<td>18.25</td>
<td>110.26</td>
<td>41.35</td>
<td>49.21</td>
<td>19.32</td>
</tr>
<tr>
<td>8</td>
<td>211.75</td>
<td>18.57</td>
<td>115.46</td>
<td>42.53</td>
<td>49.98</td>
<td>19.81</td>
</tr>
<tr>
<td>16</td>
<td>236.06</td>
<td>19.11</td>
<td>117.18</td>
<td>42.59</td>
<td>50.20</td>
<td>19.92</td>
</tr>
<tr>
<td>32</td>
<td>261.13</td>
<td>19.46</td>
<td>116.20</td>
<td>42.72</td>
<td>49.95</td>
<td>19.80</td>
</tr>
</tbody></table>
<p>AWS EC2 C5.xlarge:</p>
<table><thead>
<tr>
<th>Batch</th>
<th>Alexnet</th>
<th>VGG 16</th>
<th>Inception-BN</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
<th>Resnet 152</th>
</tr>
</thead><tbody>
<tr>
<td>1</td>
<td>36.64</td>
<td>3.93</td>
<td>27.06</td>
<td>10.09</td>
<td>12.98</td>
<td>5.06</td>
</tr>
<tr>
<td>2</td>
<td>49.21</td>
<td>4.49</td>
<td>29.67</td>
<td>10.80</td>
<td>12.94</td>
<td>5.14</td>
</tr>
<tr>
<td>4</td>
<td>50.12</td>
<td>4.50</td>
<td>30.31</td>
<td>10.83</td>
<td>13.17</td>
<td>5.19</td>
</tr>
<tr>
<td>8</td>
<td>54.71</td>
<td>4.58</td>
<td>30.22</td>
<td>10.89</td>
<td>13.19</td>
<td>5.20</td>
</tr>
<tr>
<td>16</td>
<td>60.23</td>
<td>4.70</td>
<td>30.20</td>
<td>10.91</td>
<td>13.23</td>
<td>5.19</td>
</tr>
<tr>
<td>32</td>
<td>66.37</td>
<td>4.76</td>
<td>30.10</td>
<td>10.90</td>
<td>13.22</td>
<td>5.15</td>
</tr>
</tbody></table>
<h2 id="other-cpu">Other CPU</h2>
<p>When running on CPUs (including ARM CPUs, not just Intel), NNPACK can improve running performance by roughly 2x to 7x; please check <a href="nnpack">nnpack.md</a> for details.</p>
<h2 id="nvidia-gpu">Nvidia GPU</h2>
<p><code>cuDNN</code> typically accelerates <em>MXNet</em> performance on NVIDIA GPUs significantly,
especially for convolution layers.
We suggest always checking to make sure that a recent cuDNN version is used.</p>
<p>Setting the environment variable <code>MXNET_CUDNN_AUTOTUNE_DEFAULT=1</code> sometimes also helps.</p>
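<p>For example, here is a minimal sketch of enabling autotuning from Python (the runtime feature check assumes MXNet 1.5 or later):</p>
<div class="highlight"><pre><code class="language-python" data-lang="python">import os

# Illustrative sketch: ask MXNet to autotune cuDNN convolution algorithms.
os.environ['MXNET_CUDNN_AUTOTUNE_DEFAULT'] = '1'

import mxnet as mx
from mxnet.runtime import Features

# Confirm that this MXNet build was compiled against cuDNN.
print(Features().is_enabled('CUDNN'))
</code></pre></div>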
<p>We show results when using various GPUs including K80 (EC2 p2.2xlarge), M60 (EC2 g3.4xlarge),
and V100 (EC2 p3.2xlarge).</p>
<h3 id="scoring-results">Scoring results</h3>
<p>Based on
<a href="https://github.com/apache/mxnet/blob/v1.x/example/image-classification/benchmark_score.py">example/image-classification/benchmark_score.py</a>
and MXNet 1.2.0.rc1, with cuDNN 7.0.5:</p>
<ul>
<li>K80 (single GPU)</li>
</ul>
<table><thead>
<tr>
<th>Batch</th>
<th>Alexnet</th>
<th>VGG 16</th>
<th>Inception-BN</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
<th>Resnet 152</th>
</tr>
</thead><tbody>
<tr>
<td>1</td>
<td>243.93</td>
<td>43.59</td>
<td>68.62</td>
<td>35.52</td>
<td>67.41</td>
<td>23.65</td>
</tr>
<tr>
<td>2</td>
<td>338.16</td>
<td>49.14</td>
<td>113.41</td>
<td>56.29</td>
<td>93.35</td>
<td>33.88</td>
</tr>
<tr>
<td>4</td>
<td>478.92</td>
<td>53.44</td>
<td>159.61</td>
<td>74.43</td>
<td>119.18</td>
<td>45.23</td>
</tr>
<tr>
<td>8</td>
<td>683.52</td>
<td>70.50</td>
<td>190.49</td>
<td>86.23</td>
<td>131.32</td>
<td>50.54</td>
</tr>
<tr>
<td>16</td>
<td>1004.66</td>
<td>109.01</td>
<td>254.20</td>
<td>105.70</td>
<td>155.40</td>
<td>62.55</td>
</tr>
<tr>
<td>32</td>
<td>1238.55</td>
<td>114.98</td>
<td>285.49</td>
<td>116.79</td>
<td>159.42</td>
<td>64.99</td>
</tr>
<tr>
<td>64</td>
<td>1346.72</td>
<td>123.56</td>
<td>308.73</td>
<td>122.21</td>
<td>167.58</td>
<td>70.21</td>
</tr>
<tr>
<td>128</td>
<td>1416.91</td>
<td>OOM</td>
<td>320.98</td>
<td>123.11</td>
<td>171.55</td>
<td>71.85</td>
</tr>
<tr>
<td>256</td>
<td>1462.97</td>
<td>OOM</td>
<td>329.16</td>
<td>127.53</td>
<td>153.01</td>
<td>57.23</td>
</tr>
</tbody></table>
<ul>
<li>M60</li>
</ul>
<table><thead>
<tr>
<th>Batch</th>
<th>Alexnet</th>
<th>VGG 16</th>
<th>Inception-BN</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
<th>Resnet 152</th>
</tr>
</thead><tbody>
<tr>
<td>1</td>
<td>243.49</td>
<td>59.95</td>
<td>101.97</td>
<td>48.30</td>
<td>95.46</td>
<td>39.29</td>
</tr>
<tr>
<td>2</td>
<td>491.04</td>
<td>69.14</td>
<td>170.35</td>
<td>80.27</td>
<td>142.61</td>
<td>60.17</td>
</tr>
<tr>
<td>4</td>
<td>711.54</td>
<td>78.94</td>
<td>257.89</td>
<td>123.09</td>
<td>182.36</td>
<td>76.51</td>
</tr>
<tr>
<td>8</td>
<td>1077.73</td>
<td>109.34</td>
<td>343.42</td>
<td>152.82</td>
<td>208.74</td>
<td>87.27</td>
</tr>
<tr>
<td>16</td>
<td>1447.21</td>
<td>144.93</td>
<td>390.25</td>
<td>166.32</td>
<td>220.73</td>
<td>92.41</td>
</tr>
<tr>
<td>32</td>
<td>1797.66</td>
<td>151.86</td>
<td>416.69</td>
<td>176.56</td>
<td>230.19</td>
<td>97.03</td>
</tr>
<tr>
<td>64</td>
<td>1779.38</td>
<td>150.18</td>
<td>427.51</td>
<td>183.47</td>
<td>239.12</td>
<td>101.59</td>
</tr>
<tr>
<td>128</td>
<td>1787.36</td>
<td>OOM</td>
<td>439.04</td>
<td>185.29</td>
<td>243.31</td>
<td>103.39</td>
</tr>
<tr>
<td>256</td>
<td>1899.10</td>
<td>OOM</td>
<td>450.22</td>
<td>183.42</td>
<td>242.36</td>
<td>100.98</td>
</tr>
</tbody></table>
<ul>
<li>V100</li>
</ul>
<table><thead>
<tr>
<th>Batch</th>
<th>Alexnet</th>
<th>VGG 16</th>
<th>Inception-BN</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
<th>Resnet 152</th>
</tr>
</thead><tbody>
<tr>
<td>1</td>
<td>659.51</td>
<td>205.16</td>
<td>157.37</td>
<td>87.71</td>
<td>162.15</td>
<td>61.38</td>
</tr>
<tr>
<td>2</td>
<td>1248.21</td>
<td>265.40</td>
<td>297.34</td>
<td>159.24</td>
<td>293.74</td>
<td>116.30</td>
</tr>
<tr>
<td>4</td>
<td>2122.41</td>
<td>333.97</td>
<td>520.91</td>
<td>279.84</td>
<td>479.14</td>
<td>195.17</td>
</tr>
<tr>
<td>8</td>
<td>3894.30</td>
<td>420.26</td>
<td>898.09</td>
<td>455.03</td>
<td>699.39</td>
<td>294.19</td>
</tr>
<tr>
<td>16</td>
<td>5815.58</td>
<td>654.16</td>
<td>1430.97</td>
<td>672.54</td>
<td>947.45</td>
<td>398.79</td>
</tr>
<tr>
<td>32</td>
<td>7906.09</td>
<td>708.43</td>
<td>1847.26</td>
<td>814.59</td>
<td>1076.81</td>
<td>451.82</td>
</tr>
<tr>
<td>64</td>
<td>9486.26</td>
<td>701.59</td>
<td>2134.89</td>
<td>899.01</td>
<td>1168.37</td>
<td>480.44</td>
</tr>
<tr>
<td>128</td>
<td>10177.84</td>
<td>703.30</td>
<td>2318.32</td>
<td>904.33</td>
<td>1233.15</td>
<td>511.79</td>
</tr>
<tr>
<td>256</td>
<td>10990.46</td>
<td>473.62</td>
<td>2425.28</td>
<td>960.20</td>
<td>1155.07</td>
<td>449.35</td>
</tr>
</tbody></table>
<p>Below are the performance results on V100 using float16.</p>
<table><thead>
<tr>
<th>Batch</th>
<th>VGG 16</th>
<th>Inception-BN</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
<th>Resnet 152</th>
</tr>
</thead><tbody>
<tr>
<td>1</td>
<td>276.29</td>
<td>155.53</td>
<td>150.99</td>
<td>270.89</td>
<td>96.79</td>
</tr>
<tr>
<td>2</td>
<td>476.91</td>
<td>296.45</td>
<td>282.02</td>
<td>493.99</td>
<td>176.88</td>
</tr>
<tr>
<td>4</td>
<td>711.92</td>
<td>525.05</td>
<td>492.45</td>
<td>851.15</td>
<td>321.52</td>
</tr>
<tr>
<td>8</td>
<td>1047.11</td>
<td>900.26</td>
<td>807.94</td>
<td>1282.36</td>
<td>517.66</td>
</tr>
<tr>
<td>16</td>
<td>1299.88</td>
<td>1441.41</td>
<td>1192.21</td>
<td>1722.97</td>
<td>724.57</td>
</tr>
<tr>
<td>32</td>
<td>1486.63</td>
<td>1854.30</td>
<td>1512.08</td>
<td>2085.51</td>
<td>887.34</td>
</tr>
<tr>
<td>64</td>
<td>1219.65</td>
<td>2138.61</td>
<td>1687.35</td>
<td>2341.67</td>
<td>1002.90</td>
</tr>
<tr>
<td>128</td>
<td>1169.81</td>
<td>2317.39</td>
<td>1818.26</td>
<td>2355.04</td>
<td>1046.98</td>
</tr>
<tr>
<td>256</td>
<td>764.16</td>
<td>2425.16</td>
<td>1653.74</td>
<td>1991.88</td>
<td>976.73</td>
</tr>
</tbody></table>
<h3 id="training-results">Training results</h3>
<p>Based on
<a href="https://github.com/apache/mxnet/blob/v1.x/example/image-classification/train_imagenet.py">example/image-classification/train_imagenet.py</a>
and MXNet 1.2.0.rc1, with cuDNN 7.0.5. The benchmark script is available
<a href="https://github.com/mli/mxnet-benchmark/blob/master/run_vary_batch.sh">here</a>,
where the batch size for Alexnet is increased by 16x.</p>
<ul>
<li>K80 (single GPU)</li>
</ul>
<table><thead>
<tr>
<th>Batch</th>
<th>Alexnet(*16)</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
</tr>
</thead><tbody>
<tr>
<td>1</td>
<td>300.30</td>
<td>10.48</td>
<td>15.61</td>
</tr>
<tr>
<td>2</td>
<td>406.08</td>
<td>16.00</td>
<td>23.88</td>
</tr>
<tr>
<td>4</td>
<td>461.01</td>
<td>22.10</td>
<td>32.26</td>
</tr>
<tr>
<td>8</td>
<td>484.00</td>
<td>26.80</td>
<td>39.42</td>
</tr>
<tr>
<td>16</td>
<td>490.45</td>
<td>31.62</td>
<td>46.69</td>
</tr>
<tr>
<td>32</td>
<td>414.72</td>
<td>33.78</td>
<td>49.48</td>
</tr>
</tbody></table>
<ul>
<li>M60</li>
</ul>
<table><thead>
<tr>
<th>Batch</th>
<th>Alexnet(*16)</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
</tr>
</thead><tbody>
<tr>
<td>1</td>
<td>380.96</td>
<td>14.06</td>
<td>20.55</td>
</tr>
<tr>
<td>2</td>
<td>530.53</td>
<td>21.90</td>
<td>32.65</td>
</tr>
<tr>
<td>4</td>
<td>600.17</td>
<td>31.96</td>
<td>45.57</td>
</tr>
<tr>
<td>8</td>
<td>633.60</td>
<td>40.58</td>
<td>54.92</td>
</tr>
<tr>
<td>16</td>
<td>639.37</td>
<td>46.88</td>
<td>64.44</td>
</tr>
<tr>
<td>32</td>
<td>576.54</td>
<td>50.05</td>
<td>68.34</td>
</tr>
</tbody></table>
<ul>
<li>V100</li>
</ul>
<table><thead>
<tr>
<th>Batch</th>
<th>Alexnet(*16)</th>
<th>Inception-v3</th>
<th>Resnet 50</th>
</tr>
</thead><tbody>
<tr>
<td>1</td>
<td>1629.52</td>
<td>21.83</td>
<td>34.54</td>
</tr>
<tr>
<td>2</td>
<td>2359.73</td>
<td>40.11</td>
<td>65.01</td>
</tr>
<tr>
<td>4</td>
<td>2687.89</td>
<td>72.79</td>
<td>113.49</td>
</tr>
<tr>
<td>8</td>
<td>2919.02</td>
<td>118.43</td>
<td>174.81</td>
</tr>
<tr>
<td>16</td>
<td>2994.32</td>
<td>173.15</td>
<td>251.22</td>
</tr>
<tr>
<td>32</td>
<td>2585.61</td>
<td>214.48</td>
<td>298.51</td>
</tr>
<tr>
<td>64</td>
<td>1984.21</td>
<td>247.43</td>
<td>343.19</td>
</tr>
<tr>
<td>128</td>
<td>OOM</td>
<td>253.68</td>
<td>363.69</td>
</tr>
</tbody></table>
<h2 id="multiple-devices">Multiple Devices</h2>
<p>If more than one GPU or machine is used, MXNet uses <code>kvstore</code> to communicate data.
It&#39;s critical to use the proper type of <code>kvstore</code> to get the best performance.
Refer to the <a href="https://mxnet.io/api/faq/distributed_training.html">distributed training documentation</a> for more
details.</p>
<p>Besides, we can use <a href="https://github.com/apache/mxnet/tree/v1.x/tools/bandwidth">tools/bandwidth</a>
to find the communication cost per batch.
Ideally, the communication cost should be less than the time to compute a batch.
To reduce the communication cost, we can consider:</p>
<ul>
<li>Exploring different <code>--kv-store</code> options (see the sketch after this list).</li>
<li>Increasing the batch size to improve the computation to communication ratio.</li>
</ul>
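<p>For instance, here is a minimal sketch (the small network and the two-GPU setup are hypothetical) of creating a <code>kvstore</code> and passing it to a Gluon <code>Trainer</code>:</p>
<div class="highlight"><pre><code class="language-python" data-lang="python">import mxnet as mx
from mxnet import gluon

# Hypothetical network, used only to illustrate wiring a kvstore into training.
net = gluon.nn.Sequential()
net.add(gluon.nn.Dense(128, activation='relu'), gluon.nn.Dense(10))
net.initialize(ctx=[mx.gpu(0), mx.gpu(1)])

# 'device' aggregates gradients on the GPUs; compare it against 'local',
# or against the 'dist_*' types when training across machines.
kv = mx.kvstore.create('device')
trainer = gluon.Trainer(net.collect_params(), 'sgd',
                        {'learning_rate': 0.1}, kvstore=kv)
</code></pre></div>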
<p>Finally, MXNet is integrated with other distributed training frameworks, including <a href="https://github.com/apache/mxnet/tree/master/example/distributed_training-horovod">horovod</a> and <a href="https://github.com/bytedance/byteps#use-byteps-in-your-code">BytePS</a>.</p>
<h2 id="input-data">Input Data</h2>
<p>To make sure you&#39;re handling input data in a reasonable way, consider the following:</p>
<ul>
<li>Data format: If you are using the <code>rec</code> format, then everything should be fine.</li>
<li>Decoding: By default, <em>MXNet</em> uses 4 CPU threads for decoding images.
This is often sufficient to decode more than 1K images per second.
If you are using a low-end CPU, or if your GPUs are very powerful, you can increase the number of threads (see the sketch after this list).</li>
<li>Storage location. Any local or distributed file system (HDFS, Amazon S3) should be fine.
If multiple devices read the data from the shared network file system (NFS) at the same time, problems might occur.</li>
<li>Use a large batch size. We often choose the largest one that fits into GPU memory.
A value that&#39;s too large can slow down convergence.
For example, the safe batch size for CIFAR 10 is approximately 200, while for ImageNet 1K, the batch size can exceed 1K.</li>
</ul>
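<p>As a rough sketch (the record file name and the image shape below are placeholders for your own dataset), both the number of decoding threads and the batch size can be set on <code>mx.io.ImageRecordIter</code>:</p>
<div class="highlight"><pre><code class="language-python" data-lang="python">import mxnet as mx

# Sketch only: 'train.rec' and the 224x224 shape stand in for your own data.
train_iter = mx.io.ImageRecordIter(
    path_imgrec='train.rec',      # data stored in RecordIO (.rec) format
    data_shape=(3, 224, 224),     # channels, height, width
    batch_size=128,               # use the largest batch that fits in GPU memory
    preprocess_threads=8,         # raise beyond the default 4 if decoding is the bottleneck
    shuffle=True)
</code></pre></div>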
<h2 id="profiler">Profiler</h2>
<p><em>MXNet</em> has a built-in profiler
that gives detailed information about execution time at the operator level.
This feature complements general profiling tools like <em>nvprof</em> and <em>gprof</em>
by summarizing at the operator level, instead of a function, kernel, or instruction level.</p>
<p>The profiler can be turned on with an <a href="/versions/1.9.1/api/faq/env_var#control-the-profiler">environment variable</a>
for an entire program run, or programmatically for just part of a run. Note that, by default, the profiler hides the details of individual operators; you can reveal them by setting the environment variables <code>MXNET_EXEC_BULK_EXEC_INFERENCE</code>, <code>MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN</code>, and <code>MXNET_EXEC_BULK_EXEC_TRAIN</code> to 0.
See <a href="https://github.com/apache/mxnet/tree/v1.x/example/profiler">example/profiler</a>
for complete examples of how to use the profiler in code, or <a href="https://mxnet.apache.org/api/python/docs/tutorials/performance/backend/profiler.html">this tutorial</a> on how to profile MXNet performance.</p>
<p>Briefly, the Python code looks like:</p>
<div class="highlight"><pre><code class="language-python" data-lang="python"> <span class="c1"># wait for previous operations to complete
</span> <span class="n">mx</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">waitall</span><span class="p">()</span>
<span class="n">mx</span><span class="o">.</span><span class="n">profiler</span><span class="o">.</span><span class="n">set_config</span><span class="p">(</span><span class="n">profile_all</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">aggregate_stats</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">filename</span><span class="o">=</span><span class="s">'profile_output.json'</span><span class="p">)</span>
<span class="n">mx</span><span class="o">.</span><span class="n">profiler</span><span class="o">.</span><span class="n">set_state</span><span class="p">(</span><span class="s">'run'</span><span class="p">)</span>
<span class="c1"># Code to be profiled goes here...
</span>
<span class="c1"># wait for previous operations to complete
</span> <span class="n">mx</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">waitall</span><span class="p">()</span>
<span class="n">mx</span><span class="o">.</span><span class="n">profiler</span><span class="o">.</span><span class="n">set_state</span><span class="p">(</span><span class="s">'stop'</span><span class="p">)</span>
</code></pre></div>
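<p>Because <code>aggregate_stats=True</code> is passed to <code>set_config</code> above, a per-operator summary can also be printed once profiling has stopped, for example:</p>
<div class="highlight"><pre><code class="language-python" data-lang="python"># Print an aggregated, per-operator summary of the collected statistics.
print(mx.profiler.dumps())
</code></pre></div>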
<p>After the program finishes, navigate to your browser&#39;s tracing page (for example, chrome://tracing in Chrome) and load the <code>profile_output.json</code> file written by the profiler to inspect the results.</p>
<p><img src="https://cloud.githubusercontent.com/assets/17693755/18035938/0a43484a-6d93-11e6-80d4-241c6ca552ea.png" alt="MLP Profile"></p>
<p>Note that the output file can grow extremely large, so this approach is not recommended for general use.</p>
</div>
</div>
</div>
</div>
</article>
</main><footer class="site-footer h-card">
<div class="wrapper">
<div class="row">
<div class="col-4">
<h4 class="footer-category-title">Resources</h4>
<ul class="contact-list">
<li><a href="/versions/1.9.1/community/contribute#mxnet-dev-communications">Mailing lists</a></li>
<li><a href="https://cwiki.apache.org/confluence/display/MXNET/Apache+MXNet+Home">Developer Wiki</a></li>
<li><a href="https://issues.apache.org/jira/projects/MXNET/issues">Jira Tracker</a></li>
<li><a href="https://github.com/apache/mxnet/labels/Roadmap">Github Roadmap</a></li>
<li><a href="https://medium.com/apache-mxnet">Blog</a></li>
<li><a href="https://discuss.mxnet.io">Forum</a></li>
<li><a href="/versions/1.9.1/community/contribute">Contribute</a></li>
</ul>
</div>
<div class="col-4"><ul class="social-media-list"><li><a href="https://github.com/apache/mxnet"><svg class="svg-icon"><use xlink:href="/versions/1.9.1/assets/minima-social-icons.svg#github"></use></svg> <span class="username">apache/mxnet</span></a></li><li><a href="https://www.twitter.com/apachemxnet"><svg class="svg-icon"><use xlink:href="/versions/1.9.1/assets/minima-social-icons.svg#twitter"></use></svg> <span class="username">apachemxnet</span></a></li><li><a href="https://youtube.com/apachemxnet"><svg class="svg-icon"><use xlink:href="/versions/1.9.1/assets/minima-social-icons.svg#youtube"></use></svg> <span class="username">apachemxnet</span></a></li></ul>
</div>
<div class="col-4 footer-text">
<p>A flexible and efficient library for deep learning.</p>
</div>
</div>
</div>
</footer>
<footer class="site-footer2">
<div class="wrapper">
<div class="row">
<div class="col-3">
<img src="/versions/1.9.1/assets/img/asf_logo.svg" class="footer-logo col-2">
</div>
<div class="footer-bottom-warning col-9">
</p><p>"Copyright © 2017-2022, The Apache Software Foundation. Licensed under the Apache License, Version 2.0. Apache MXNet, MXNet, Apache, the Apache
feather, and the Apache MXNet project logo are either registered trademarks or trademarks of the
Apache Software Foundation."</p>
</div>
</div>
</div>
</footer>
</body>
</html>