<!DOCTYPE html>
<!---
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<html lang="en"><head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link href="/versions/1.9.1/assets/img/mxnet-icon.png" rel="icon" type="image/png"><!-- Begin Jekyll SEO tag v2.6.1 -->
<title>Float16 | Apache MXNet</title>
<meta name="generator" content="Jekyll v3.8.6" />
<meta property="og:title" content="Float16" />
<meta property="og:locale" content="en_US" />
<meta name="description" content="A flexible and efficient library for deep learning." />
<meta property="og:description" content="A flexible and efficient library for deep learning." />
<link rel="canonical" href="https://mxnet.apache.org/versions/1.9.1/api/faq/float16" />
<meta property="og:url" content="https://mxnet.apache.org/versions/1.9.1/api/faq/float16" />
<meta property="og:site_name" content="Apache MXNet" />
<script type="application/ld+json">
{"description":"A flexible and efficient library for deep learning.","headline":"Float16","@type":"WebPage","url":"https://mxnet.apache.org/versions/1.9.1/api/faq/float16","@context":"https://schema.org"}</script>
<!-- End Jekyll SEO tag -->
<link rel="stylesheet" href="/versions/1.9.1/assets/docsearch.min.css" /><link rel="stylesheet" href="/versions/1.9.1/assets/main.css"><link type="application/atom+xml" rel="alternate" href="https://mxnet.apache.org/versions/1.9.1/feed.xml" title="Apache MXNet" /><!-- Matomo -->
<script>
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
/* We explicitly disable cookie tracking to avoid privacy issues */
_paq.push(['disableCookies']);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '23']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
<script src="/versions/1.9.1/assets/js/jquery-3.3.1.min.js"></script>
<script src="/versions/1.9.1/assets/js/docsearch.min.js"></script><script src="/versions/1.9.1/assets/js/globalSearch.js" defer></script>
<script src="/versions/1.9.1/assets/js/clipboard.js" defer></script>
<script src="/versions/1.9.1/assets/js/copycode.js" defer></script></head>
<body><header class="site-header" role="banner">
<script>
$(document).ready(function () {
// HEADER OPACITY LOGIC
function opacity_header() {
var value = "rgba(4,140,204," + ($(window).scrollTop() / 300 + 0.4) + ")"
$('.site-header').css("background-color", value)
}
$(window).scroll(function () {
opacity_header()
})
opacity_header();
// MENU SELECTOR LOGIC
$('.page-link').each( function () {
if (window.location.href.includes(this.href)) {
$(this).addClass("page-current");
}
});
})
</script>
<div class="wrapper">
<a class="site-title" rel="author" href="/versions/1.9.1/"><img
src="/versions/1.9.1/assets/img/mxnet_logo.png" class="site-header-logo" alt="Apache MXNet"></a>
<nav class="site-nav">
<input type="checkbox" id="nav-trigger" class="nav-trigger"/>
<label for="nav-trigger">
<span class="menu-icon">
<svg viewBox="0 0 18 15" width="18px" height="15px">
<path d="M18,1.484c0,0.82-0.665,1.484-1.484,1.484H1.484C0.665,2.969,0,2.304,0,1.484l0,0C0,0.665,0.665,0,1.484,0 h15.032C17.335,0,18,0.665,18,1.484L18,1.484z M18,7.516C18,8.335,17.335,9,16.516,9H1.484C0.665,9,0,8.335,0,7.516l0,0 c0-0.82,0.665-1.484,1.484-1.484h15.032C17.335,6.031,18,6.696,18,7.516L18,7.516z M18,13.516C18,14.335,17.335,15,16.516,15H1.484 C0.665,15,0,14.335,0,13.516l0,0c0-0.82,0.665-1.483,1.484-1.483h15.032C17.335,12.031,18,12.695,18,13.516L18,13.516z"/>
</svg>
</span>
</label>
<div class="gs-search-border">
<div id="gs-search-icon"></div>
<form id="global-search-form">
<input id="global-search" type="text" title="Search" placeholder="Search" />
<div id="global-search-dropdown-container">
<button class="gs-current-version btn" type="button" data-toggle="dropdown">
<span id="gs-current-version-label">1.9.1</span>
<svg class="gs-dropdown-caret" viewBox="0 0 32 32" aria-hidden="true">
<path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path>
</svg>
</button>
<ul class="gs-opt-group gs-version-dropdown">
<li class="gs-opt gs-versions">master</li>
<li class="gs-opt gs-versions active">1.9.1</li>
<li class="gs-opt gs-versions">1.8.0</li>
<li class="gs-opt gs-versions">1.7.0</li>
<li class="gs-opt gs-versions">1.6.0</li>
<li class="gs-opt gs-versions">1.5.0</li>
<li class="gs-opt gs-versions">1.4.1</li>
<li class="gs-opt gs-versions">1.3.1</li>
<li class="gs-opt gs-versions">1.2.1</li>
<li class="gs-opt gs-versions">1.1.0</li>
<li class="gs-opt gs-versions">1.0.0</li>
<li class="gs-opt gs-versions">0.12.1</li>
<li class="gs-opt gs-versions">0.11.0</li>
</ul>
</div>
<span id="global-search-close">x</span>
</form>
</div>
<div class="trigger">
<div id="global-search-mobile-border">
<div id="gs-search-icon-mobile"></div>
<input id="global-search-mobile" placeholder="Search..." type="text" aria-label="Search"/>
<div id="global-search-dropdown-container-mobile">
<button class="gs-current-version-mobile btn" type="button" data-toggle="dropdown">
<svg class="gs-dropdown-caret" viewBox="0 0 32 32" aria-hidden="true">
<path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path>
</svg>
</button>
<ul class="gs-opt-group gs-version-dropdown-mobile">
<li class="gs-opt gs-versions">master</li>
<li class="gs-opt gs-versions active">1.9.1</li>
<li class="gs-opt gs-versions">1.8.0</li>
<li class="gs-opt gs-versions">1.7.0</li>
<li class="gs-opt gs-versions">1.6.0</li>
<li class="gs-opt gs-versions">1.5.0</li>
<li class="gs-opt gs-versions">1.4.1</li>
<li class="gs-opt gs-versions">1.3.1</li>
<li class="gs-opt gs-versions">1.2.1</li>
<li class="gs-opt gs-versions">1.1.0</li>
<li class="gs-opt gs-versions">1.0.0</li>
<li class="gs-opt gs-versions">0.12.1</li>
<li class="gs-opt gs-versions">0.11.0</li>
</ul>
</div>
</div>
<a class="page-link" href="/versions/1.9.1/get_started">Get Started</a>
<a class="page-link" href="/versions/1.9.1/features">Features</a>
<a class="page-link" href="/versions/1.9.1/ecosystem">Ecosystem</a>
<a class="page-link" href="/versions/1.9.1/api">Docs &amp; Tutorials</a>
<a class="page-link" href="/versions/1.9.1/trusted_by">Trusted By</a>
<a class="page-link" href="https://github.com/apache/mxnet">GitHub</a>
<div class="dropdown" style="min-width:100px">
<span class="dropdown-header">Apache
<svg class="dropdown-caret" viewBox="0 0 32 32" aria-hidden="true"><path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path></svg>
</span>
<div class="dropdown-content" style="min-width:250px">
<a href="https://www.apache.org/foundation/">Apache Software Foundation</a>
<a href="https://www.apache.org/licenses/">License</a>
<a href="/versions/1.9.1/api/faq/security.html">Security</a>
<a href="https://privacy.apache.org/policies/privacy-policy-public.html">Privacy</a>
<a href="https://www.apache.org/events/current-event">Events</a>
<a href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a>
<a href="https://www.apache.org/foundation/thanks.html">Thanks</a>
</div>
</div>
<div class="dropdown">
<span class="dropdown-header">1.9.1
<svg class="dropdown-caret" viewBox="0 0 32 32" aria-hidden="true"><path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path></svg>
</span>
<div class="dropdown-content">
<a href="/">master</a>
<a class="dropdown-option-active" href="/versions/1.9.1/">1.9.1</a>
<a href="/versions/1.8.0/">1.8.0</a>
<a href="/versions/1.7.0/">1.7.0</a>
<a href="/versions/1.6.0/">1.6.0</a>
<a href="/versions/1.5.0/">1.5.0</a>
<a href="/versions/1.4.1/">1.4.1</a>
<a href="/versions/1.3.1/">1.3.1</a>
<a href="/versions/1.2.1/">1.2.1</a>
<a href="/versions/1.1.0/">1.1.0</a>
<a href="/versions/1.0.0/">1.0.0</a>
<a href="/versions/0.12.1/">0.12.1</a>
<a href="/versions/0.11.0/">0.11.0</a>
</div>
</div>
</div>
</nav>
</div>
</header>
<main class="page-content" aria-label="Content">
<script>
</script>
<article class="post">
<header class="post-header wrapper">
<h1 class="post-title">Float16</h1>
<h3></h3></header>
<div class="post-content">
<div class="wrapper">
<div class="row">
<div class="col-3 docs-side-bar">
<h3 style="text-transform: capitalize; padding-left:10px">faq</h3>
<ul>
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.9.1/api/faq/add_op_in_backend">A Beginner's Guide to Implementing Operators in MXNet Backend</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.9.1/api/faq/caffe">Convert from Caffe to MXNet</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.9.1/api/faq/cloud">MXNet on the Cloud</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.9.1/api/faq/distributed_training">Distributed Training in MXNet</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.9.1/api/faq/env_var">Environment Variables</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.9.1/api/faq/float16">Float16</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.9.1/api/faq/large_tensor_support">Using MXNet with Large Tensor Support</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.9.1/api/faq/model_parallel_lstm">Model Parallel</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.9.1/api/faq/multi_device">Data Parallelism with Multiple CPU/GPUs on MXNet</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.9.1/api/faq/new_op">Create New Operators</a></li>
<!-- page-category -->
<li><a href="/versions/1.9.1/api/faq/nnpack">NNPACK for Multi-Core CPU Support in MXNet</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.9.1/api/faq/perf">Some Tips for Improving MXNet Performance</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.9.1/api/faq/recordio">Create a Dataset Using RecordIO</a></li>
<!-- page-category -->
<li><a href="/versions/1.9.1/api/faq/s3_integration">Use data from S3 for training</a></li>
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.9.1/api/faq/security">MXNet Security Best Practices</a></li>
<!-- page-category -->
<li><a href="/versions/1.9.1/api/faq/smart_device">Deep Learning at the Edge</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.9.1/api/faq/visualize_graph">Visualize Neural Networks</a></li>
<!-- page-category -->
<li><a href="/versions/1.9.1/api/faq/why_mxnet">Why MXNet came to be?</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- resource-p -->
</ul>
</div>
<div class="col-9">
<!--- Licensed to the Apache Software Foundation (ASF) under one -->
<!--- or more contributor license agreements. See the NOTICE file -->
<!--- distributed with this work for additional information -->
<!--- regarding copyright ownership. The ASF licenses this file -->
<!--- to you under the Apache License, Version 2.0 (the -->
<!--- "License"); you may not use this file except in compliance -->
<!--- with the License. You may obtain a copy of the License at -->
<!--- http://www.apache.org/licenses/LICENSE-2.0 -->
<!--- Unless required by applicable law or agreed to in writing, -->
<!--- software distributed under the License is distributed on an -->
<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
<!--- KIND, either express or implied. See the License for the -->
<!--- specific language governing permissions and limitations -->
<!--- under the License. -->
<h1 id="mixed-precision-training-using-float16">Mixed precision training using float16</h1>
<p>In this tutorial we will walk through how one can train deep learning neural networks with mixed precision on supported hardware. We will first see how to use float16 (both with Gluon and Symbolic APIs) and then some techniques on achieving good performance and accuracy.</p>
<h2 id="background">Background</h2>
<p>The computational resources required for training deep neural networks have lately been increasing because of growing complexity and model size. Mixed precision training allows us to reduce the utilization of the resources by using lower precision arithmetic, which is computationally less expensive and less costly in terms of space utilization. In this approach you can train using 16 bit floating point (half precision) while using 32 bit floating point (single precision) for output buffers of float16 computation. This allows one to achieve the same accuracy as training with single precision, while decreasing the required memory and training or inference time.</p>
<p>The float16 data type is a 16 bit floating point representation according to the <a href="https://ieeexplore.ieee.org/document/4610935">IEEE 754 standard</a>. It has a dynamic range where the precision can go from 0.0000000596046 (highest, for values closest to 0) to 32 (lowest, for values in the range 32768-65536). Despite the inherent reduced precision when compared to single precision float (float32), using float16 has many advantages. The most obvious advantages are that you can reduce the size of the model by half allowing the training of larger models and using larger batch sizes. The reduced memory footprint also helps in reducing the pressure on memory bandwidth and lowering communication costs. On hardware with specialized support for float16 computation you can also greatly improve the speed of training and inference. The Volta range of Graphics Processing Units (GPUs) from Nvidia have <a href="https://www.nvidia.com/en-us/data-center/tensorcore/">Tensor Cores</a> which perform efficient float16 computation. A tensor core allows accumulation of half precision products into single or half precision outputs. For the rest of this tutorial we assume that we are working with Nvidia&#39;s Tensor Cores on a Volta GPU.</p>
<h2 id="prerequisites">Prerequisites</h2>
<ul>
<li><a href="https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/">Volta</a> range of Nvidia GPUs (e.g. AWS P3 instance)</li>
<li>CUDA 9 or higher</li>
<li>cuDNN v7 or higher</li>
</ul>
<p>This tutorial also assumes understanding of how to train a network with float32 (the default). Please refer to <a href="/api/python/docs/tutorials/getting-started/logistic_regression_explained.html">logistic regression tutorial</a> to get started with Apache MXNet and Gluon API. This tutorial focuses on the changes needed to switch from float32 to mixed precision and tips on achieving the best performance with mixed precision.</p>
<h2 id="using-the-gluon-api">Using the Gluon API</h2>
<h3 id="training-or-inference">Training or Inference</h3>
<p>With Gluon API, you need to take care of three things to convert a model to support computation with float16.</p>
<ol>
<li>Cast Gluon <code>Block</code>&#39;s parameters and expected input type to float16 by calling the <a href="/api/python/docs/api/gluon/block.html?cast#mxnet.gluon.Block.cast">cast</a> method of the <code>Block</code> representing the network.</li>
</ol>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">net</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s">'float16'</span><span class="p">)</span>
</code></pre></div>
<ol start="2">
<li>Ensure the data input to the network is of float16 type. If your <code>DataLoader</code> or <code>Iterator</code> produces output in another datatype, then you would have to cast your data. There are different ways you can do this. The easiest would be to use the <a href="/api/python/docs/api/ndarray/ndarray.html?astype#mxnet.ndarray.NDArray.astype">astype</a> method of NDArrays.</li>
</ol>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">data</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s">'float16'</span><span class="p">,</span> <span class="n">copy</span><span class="o">=</span><span class="bp">False</span><span class="p">)</span>
</code></pre></div>
<p>If you are using images and DataLoader, you can also use a <a href="/api/python/docs/api/gluon/data/vision/transforms/index.html#mxnet.gluon.data.vision.transforms.Cast">Cast transform</a>.</p>
<ol start="3">
<li>It is preferable to use <strong>multi_precision mode of optimizer</strong> when training in float16. This mode of optimizer maintains a master copy of the weights in float32 even when the training (i.e. forward and backward pass) is in float16. This helps increase precision of the weight updates and can lead to faster convergence in some scenarios.</li>
</ol>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">optimizer</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">optimizer</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="s">'sgd'</span><span class="p">,</span> <span class="n">multi_precision</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">lr</span><span class="o">=</span><span class="mf">0.01</span><span class="p">)</span>
</code></pre></div>
<p>You can play around with mixed precision using the image classification <a href="https://github.com/apache/mxnet/blob/master/example/image-classification/train_imagenet.py">example</a>. We suggest using the Caltech101 dataset option in that example and using a ResNet50V1 network so you can quickly see the performance improvement and how the accuracy is unaffected. Here&#39;s the starter command to run this example.</p>
<div class="highlight"><pre><code class="language-bash" data-lang="bash">python image_classification.py <span class="nt">--model</span> resnet50_v1 <span class="nt">--dataset</span> caltech101 <span class="nt">--gpus</span> 0 <span class="nt">--num-worker</span> 30 <span class="nt">--dtype</span> float16
</code></pre></div>
<h3 id="fine-tuning">Fine-tuning</h3>
<p>You can also fine-tune a model, which was originally trained in float32, to use float16. Below is an example of how to fine-tune a pretrained model from the Model Zoo. You would first need to fetch the pretrained network and then cast that network to float16.</p>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="n">np</span>
<span class="kn">import</span> <span class="nn">mxnet</span> <span class="k">as</span> <span class="n">mx</span>
<span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="kn">import</span> <span class="n">get_model</span>
<span class="n">pretrained_net</span> <span class="o">=</span> <span class="n">get_model</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="s">'resnet50_v2'</span><span class="p">,</span> <span class="n">ctx</span><span class="o">=</span><span class="n">mx</span><span class="o">.</span><span class="n">cpu</span><span class="p">(),</span>
<span class="n">pretrained</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">classes</span><span class="o">=</span><span class="mi">1000</span><span class="p">)</span>
<span class="n">pretrained_net</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s">'float16'</span><span class="p">)</span>
</code></pre></div>
<p>Then, if you have another Resnet50V2 model you want to fine-tune, you can just assign the features to that network and then cast it.</p>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">net</span> <span class="o">=</span> <span class="n">get_model</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="s">'resnet50_v2'</span><span class="p">,</span> <span class="n">ctx</span><span class="o">=</span><span class="n">mx</span><span class="o">.</span><span class="n">cpu</span><span class="p">(),</span>
<span class="n">pretrained</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span> <span class="n">classes</span><span class="o">=</span><span class="mi">101</span><span class="p">)</span>
<span class="n">net</span><span class="o">.</span><span class="n">collect_params</span><span class="p">()</span><span class="o">.</span><span class="n">initialize</span><span class="p">(</span><span class="n">mx</span><span class="o">.</span><span class="n">init</span><span class="o">.</span><span class="n">Xavier</span><span class="p">(</span><span class="n">magnitude</span><span class="o">=</span><span class="mf">2.24</span><span class="p">),</span> <span class="n">ctx</span><span class="o">=</span><span class="n">mx</span><span class="o">.</span><span class="n">cpu</span><span class="p">())</span>
<span class="n">net</span><span class="o">.</span><span class="n">features</span> <span class="o">=</span> <span class="n">pretrained_net</span><span class="o">.</span><span class="n">features</span>
<span class="n">net</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s">'float16'</span><span class="p">)</span>
</code></pre></div>
<p>You can check the parameters of the model by calling <a href="/api/python/docs/api/gluon/block.html?block%20summary#mxnet.gluon.Block.summary">summary</a> with some fake data. Notice the provided <code>dtype=np.float16</code> in the line below. As mentioned earlier, we have to provide data as float16 as well.</p>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">net</span><span class="o">.</span><span class="n">summary</span><span class="p">(</span><span class="n">mx</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">uniform</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">224</span><span class="p">,</span> <span class="mi">224</span><span class="p">),</span> <span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">float16</span><span class="p">))</span>
</code></pre></div>
<h2 id="using-the-symbolic-api">Using the Symbolic API</h2>
<p>Training a network in float16 with the Symbolic API involves the following steps.</p>
<ol>
<li>Add a layer at the beginning of the network, to cast the data to float16. This will ensure that all the following layers compute in float16.</li>
<li>It is advisable to cast the output of the layers before softmax to float32, so that the softmax computation is done in float32. This is because softmax involves large reductions and it helps to keep that in float32 for a more precise answer.</li>
<li>It is advisable to use the multi-precision mode of the optimizer for more precise weight updates. Here&#39;s how you would enable this mode when creating an optimizer.</li>
</ol>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">optimizer</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">optimizer</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="s">'sgd'</span><span class="p">,</span> <span class="n">multi_precision</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">lr</span><span class="o">=</span><span class="mf">0.01</span><span class="p">)</span>
</code></pre></div>
<p>For a full example, please refer to <a href="https://github.com/apache/mxnet/blob/master/example/image-classification/symbols/resnet.py">resnet.py</a> file on GitHub. A small, relevant excerpt from that file is presented below.</p>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">data</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">sym</span><span class="o">.</span><span class="n">Variable</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="s">"data"</span><span class="p">)</span>
<span class="k">if</span> <span class="n">dtype</span> <span class="o">==</span> <span class="s">'float16'</span><span class="p">:</span>
    <span class="n">data</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">sym</span><span class="o">.</span><span class="n">Cast</span><span class="p">(</span><span class="n">data</span><span class="o">=</span><span class="n">data</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">float16</span><span class="p">)</span>
<span class="c1"># ... the rest of the network
</span><span class="n">net_out</span> <span class="o">=</span> <span class="n">net</span><span class="p">(</span><span class="n">data</span><span class="p">)</span>
<span class="k">if</span> <span class="n">dtype</span> <span class="o">==</span> <span class="s">'float16'</span><span class="p">:</span>
    <span class="n">net_out</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">sym</span><span class="o">.</span><span class="n">Cast</span><span class="p">(</span><span class="n">data</span><span class="o">=</span><span class="n">net_out</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
<span class="n">output</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">sym</span><span class="o">.</span><span class="n">SoftmaxOutput</span><span class="p">(</span><span class="n">data</span><span class="o">=</span><span class="n">net_out</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s">'softmax'</span><span class="p">)</span>
</code></pre></div>
<p>If you would like to train a ResNet50 model on ImageNet using float16 precision, you can find the full script <a href="https://github.com/apache/mxnet/blob/master/example/image-classification/train_imagenet.py">here</a>.</p>
<p>If you don&#39;t have the ImageNet dataset at your disposal, you can still run the script above using synthetic float16 data by providing the following command:</p>
<div class="highlight"><pre><code class="language-bash" data-lang="bash">python train_imagenet.py <span class="nt">--network</span> resnet-v1 <span class="nt">--num-layers</span> 50 <span class="nt">--benchmark</span> 1 <span class="nt">--gpus</span> 0 <span class="nt">--batch-size</span> 256 <span class="nt">--dtype</span> float16
</code></pre></div>
<p>There&#39;s a similar example for float16 fine tuning <a href="https://github.com/apache/mxnet/tree/master/example/image-classification/fine-tune.py">here</a> of selected models: Inception v3, Inception v4, ResNetV1, ResNet50, ResNext or VGG. The command below shows how to use that script to fine-tune a Resnet50 model trained on Imagenet for the Caltech 256 dataset using float16.</p>
<div class="highlight"><pre><code class="language-bash" data-lang="bash">python fine-tune.py <span class="nt">--network</span> resnet <span class="nt">--num-layers</span> 50 <span class="nt">--pretrained-model</span> imagenet1k-resnet-50 <span class="nt">--data-train</span> ~/.mxnet/dataset/caltech-256/caltech256-train.rec <span class="nt">--data-val</span> ~/data/caltech-256/caltech256-val.rec <span class="nt">--num-examples</span> 15420 <span class="nt">--num-classes</span> 256 <span class="nt">--gpus</span> 0 <span class="nt">--batch-size</span> 64 <span class="nt">--dtype</span> float16
</code></pre></div>
<p>If you don&#39;t have the <code>Caltech256</code> dataset, you can download it using the script below, and convert it into the .rec file format using the <a href="https://github.com/apache/mxnet/blob/master/tools/im2rec.py">im2rec utility</a>.</p>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">import</span> <span class="nn">os</span>
<span class="kn">from</span> <span class="nn">os.path</span> <span class="kn">import</span> <span class="n">expanduser</span>
<span class="kn">import</span> <span class="nn">tarfile</span>
<span class="kn">import</span> <span class="nn">mxnet</span> <span class="k">as</span> <span class="n">mx</span>
<span class="n">data_folder</span> <span class="o">=</span> <span class="n">expanduser</span><span class="p">(</span><span class="s">"~/.mxnet/datasets/"</span><span class="p">)</span>
<span class="n">dataset_name</span> <span class="o">=</span> <span class="s">"256_ObjectCategories"</span>
<span class="n">archive_file</span> <span class="o">=</span> <span class="s">"{}.tar"</span><span class="o">.</span><span class="nb">format</span><span class="p">(</span><span class="n">dataset_name</span><span class="p">)</span>
<span class="n">archive_path</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">data_folder</span><span class="p">,</span> <span class="n">archive_file</span><span class="p">)</span>
<span class="n">data_url</span> <span class="o">=</span> <span class="s">"http://www.vision.caltech.edu/Image_Datasets/Caltech256/"</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">isfile</span><span class="p">(</span><span class="n">archive_path</span><span class="p">):</span>
<span class="n">mx</span><span class="o">.</span><span class="n">test_utils</span><span class="o">.</span><span class="n">download</span><span class="p">(</span><span class="s">"{}{}"</span><span class="o">.</span><span class="nb">format</span><span class="p">(</span><span class="n">data_url</span><span class="p">,</span> <span class="n">archive_file</span><span class="p">),</span>
<span class="n">dirname</span><span class="o">=</span><span class="n">data_folder</span><span class="p">)</span>
<span class="k">print</span><span class="p">(</span><span class="s">'Extracting {} in {}...'</span><span class="o">.</span><span class="nb">format</span><span class="p">(</span><span class="n">archive_file</span><span class="p">,</span> <span class="n">data_folder</span><span class="p">))</span>
<span class="n">tar</span> <span class="o">=</span> <span class="n">tarfile</span><span class="o">.</span><span class="nb">open</span><span class="p">(</span><span class="n">archive_path</span><span class="p">)</span>
<span class="n">tar</span><span class="o">.</span><span class="n">extractall</span><span class="p">(</span><span class="n">data_folder</span><span class="p">)</span>
<span class="n">tar</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
<span class="k">print</span><span class="p">(</span><span class="s">'Data extracted.'</span><span class="p">)</span>
</code></pre></div>
<h2 id="example-training-results">Example training results</h2>
<p>Let us consider training a Resnet50V1 model on the ImageNet 2012 dataset. For this model, the GPU memory usage is close to the capacity of a V100 GPU with a batch size of 128 when using float32. Using float16 allows the use of a batch size of 256. Shared below are results using 8 V100 GPUs on an <a href="https://aws.amazon.com/ec2/instance-types/p3/#Amazon_EC2_P3_Instance_Product_Details">AWS p3.16xlarge</a> instance.</p>
<p>Let us compare the three scenarios that arise here: float32 with 1024 batch size, float16 with 1024 batch size and float16 with 2048 batch size. These jobs trained for 90 epochs using a learning rate of 0.4 for 1024 batch size and 0.8 for 2048 batch size. This learning rate was decayed by a factor of 0.1 at the 30th, 60th and 80th epochs. The only changes made for the float16 jobs when compared to the float32 job were that the network and data were cast to float16, and the multi-precision mode was used for the optimizer. The final accuracy at the 90th epoch and the time to train are tabulated below for these three scenarios. The top-1 validation errors at the end of each epoch are also plotted below.</p>
<table><thead>
<tr>
<th>Batch size</th>
<th>Data type</th>
<th>Top 1 Validation accuracy</th>
<th>Time to train</th>
<th>Speedup</th>
</tr>
</thead><tbody>
<tr>
<td>1024</td>
<td>float32</td>
<td>76.18%</td>
<td>11.8 hrs</td>
<td>1</td>
</tr>
<tr>
<td>1024</td>
<td>float16</td>
<td>76.34%</td>
<td>7.3 hrs</td>
<td>1.62x</td>
</tr>
<tr>
<td>2048</td>
<td>float16</td>
<td>76.29%</td>
<td>6.5 hrs</td>
<td>1.82x</td>
</tr>
</tbody></table>
<p><img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/mixed-precision/resnet50v1b_imagenet_fp16_fp32_training.png" alt="Training curves of Resnet50V1 on Imagenet 2012"></p>
<p>The differences in accuracy above are within normal random variation, and there is no reason to expect float16 to have better accuracy than float32 in general. As the plot indicates, training behaves similarly for these cases, even though we didn&#39;t have to change any other hyperparameters. We can also see from the table that using float16 helps train faster through faster computation with float16 as well as allowing the use of larger batch sizes.</p>
<h2 id="things-to-keep-in-mind">Things to keep in mind</h2>
<h3 id="for-performance">For performance</h3>
<p>Performance gains seen for float16 typically range from 1.6x to 2x for convolutional networks like Resnet and can reach about 3x for networks with LSTMs. The performance gain you see can depend on certain things which this section will introduce.</p>
<ol>
<li><p>Nvidia Tensor Cores essentially perform the computation <code>D = A * B + C</code>, where A and B are half precision matrices, while C and D could be either half precision or full precision. The tensor cores are most efficient when dimensions of these matrices are multiples of 8. This means that Tensor Cores can not be used in all cases for fast float16 computation. When training models like Resnet50 on the Cifar10 dataset, the tensors involved are sometimes smaller, and Tensor Cores can not always be used. The computation in that case falls back to slower algorithms and using float16 turns out to be slower than float32 on a single GPU. Note that when using multiple GPUs, using float16 can still be faster than float32 because of reduction in communication costs.</p></li>
<li><p>When you scale up the batch size ensure that IO and data pre-processing is not your bottleneck. If you see a slowdown this would be the first thing to check.</p></li>
<li><p>It is advisable to use batch sizes that are multiples of 8 when training with float16, because Tensor Cores are most efficient when matrix dimensions are multiples of 8. As always, batch sizes which are powers of 2 would be best when compared to those around them.</p></li>
<li><p>You can check whether your program is using Tensor cores for fast float16 computation by profiling with <code>nvprof</code>. The operations with <code>s884cudnn</code> in their names represent the use of Tensor cores.</p></li>
<li><p>When not limited by GPU memory, it can help to set the environment variable <code>MXNET_CUDNN_AUTOTUNE_DEFAULT</code> to <code>2</code>. This configures MXNet to run tuning tests and choose the fastest convolution algorithm whose memory requirements may exceed the default memory of CUDA workspace.</p></li>
<li><p>Please note that float16 on CPU might not be supported for all operators, as in most cases float16 on CPU is much slower than float32.</p></li>
</ol>
<h3 id="for-accuracy">For accuracy</h3>
<h4 id="multi-precision-mode">Multi precision mode</h4>
<p>When training in float16, it is advisable to still store the master copy of the weights in float32 for better accuracy. The higher precision of float32 helps overcome cases where gradient update can become 0 if represented in float16. This mode can be activated by setting the parameter <code>multi_precision</code> of optimizer params to <code>True</code> as in the above example. It has been found that this is not required for all networks to achieve the same accuracy as with float32, but it is nevertheless recommended. Note that for distributed training, this is currently slightly slower than without <code>multi_precision</code>, but still much faster than using float32 for training.</p>
<h4 id="large-reductions">Large reductions</h4>
<p>Since float16 has low precision for large numbers, it is best to leave layers which perform large reductions in float32. This includes BatchNorm and Softmax. Ensuring that BatchNorm performs reduction in float32 is handled by default in both Gluon and Module APIs. While Softmax is set to use float32 even during float16 training in Gluon, in the Module API there needs to be a cast to float32 before softmax as the above symbolic example code shows.</p>
<h4 id="loss-scaling">Loss scaling</h4>
<p>For some networks just switching the training to float16 mode was not found to be enough to reach the same accuracy as when training with float32. This is because the activation gradients computed are too small to be represented in the float16 range. Such networks can be made to achieve the accuracy reached by float32 with a couple of changes.</p>
<p>Most of the float16 representable range is not used by activation gradients generally. So you can shift the gradients into float16 range by scaling up the loss by a factor <code>S</code>. By the chain rule, scaling up the loss before the backward pass scales up all the gradients by the same factor, and you can then scale the gradients back down before updating the weights. This ensures that training in float16 can use the same hyperparameters as used during float32 training.</p>
<p>Here&#39;s how you can configure the loss to be scaled up by 128 and rescale the gradient down before updating the weights.</p>
<p><em>Gluon API</em></p>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">loss</span> <span class="o">=</span> <span class="n">gluon</span><span class="o">.</span><span class="n">loss</span><span class="o">.</span><span class="n">SoftmaxCrossEntropyLoss</span><span class="p">(</span><span class="n">weight</span><span class="o">=</span><span class="mi">128</span><span class="p">)</span>
<span class="n">optimizer</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">optimizer</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="s">'sgd'</span><span class="p">,</span>
<span class="n">multi_precision</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span>
<span class="n">rescale_grad</span><span class="o">=</span><span class="mf">1.0</span><span class="o">/</span><span class="mi">128</span><span class="p">)</span>
</code></pre></div>
<p><em>Module API</em></p>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">mxnet</span><span class="o">.</span><span class="n">sym</span><span class="o">.</span><span class="n">SoftmaxOutput</span><span class="p">(</span><span class="n">other_args</span><span class="p">,</span> <span class="n">grad_scale</span><span class="o">=</span><span class="mf">128.0</span><span class="p">)</span>
<span class="n">optimizer</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">optimizer</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="s">'sgd'</span><span class="p">,</span>
<span class="n">multi_precision</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span>
<span class="n">rescale_grad</span><span class="o">=</span><span class="mf">1.0</span><span class="o">/</span><span class="mi">128</span><span class="p">)</span>
</code></pre></div>
<p>Networks like Multibox SSD, R-CNN, bigLSTM and Seq2seq were found to exhibit such behavior.
You can choose a constant scaling factor while ensuring that the absolute value of gradient when multiplied by this factor remains in the range of float16. Generally powers of 2 like 64, 128, 256, 512 are chosen. Refer to the linked articles below for more details on this.</p>
<h2 id="references">References</h2>
<ol>
<li><a href="http://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html">Training with Mixed Precision User Guide</a></li>
<li><a href="https://arxiv.org/pdf/1710.03740.pdf">Mixed Precision Training at ICLR 2018</a></li>
<li><a href="https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/">Mixed-Precision Training of Deep Neural Networks</a></li>
</ol>
<h2 id="recommended-next-steps">Recommended Next Steps</h2>
<ul>
<li>Check out our video tutorial on <a href="https://www.youtube.com/watch?v=pR4KMh1lGC0">Using Mixed Precision with MXNet</a></li>
</ul>
</div>
</div>
</div>
</div>
</article>
</main><footer class="site-footer h-card">
<div class="wrapper">
<div class="row">
<div class="col-4">
<h4 class="footer-category-title">Resources</h4>
<ul class="contact-list">
<li><a href="/versions/1.9.1/community/contribute#mxnet-dev-communications">Mailing lists</a></li>
<li><a href="https://cwiki.apache.org/confluence/display/MXNET/Apache+MXNet+Home">Developer Wiki</a></li>
<li><a href="https://issues.apache.org/jira/projects/MXNET/issues">Jira Tracker</a></li>
<li><a href="https://github.com/apache/mxnet/labels/Roadmap">Github Roadmap</a></li>
<li><a href="https://medium.com/apache-mxnet">Blog</a></li>
<li><a href="https://discuss.mxnet.io">Forum</a></li>
<li><a href="/versions/1.9.1/community/contribute">Contribute</a></li>
</ul>
</div>
<div class="col-4"><ul class="social-media-list"><li><a href="https://github.com/apache/mxnet"><svg class="svg-icon"><use xlink:href="/versions/1.9.1/assets/minima-social-icons.svg#github"></use></svg> <span class="username">apache/mxnet</span></a></li><li><a href="https://www.twitter.com/apachemxnet"><svg class="svg-icon"><use xlink:href="/versions/1.9.1/assets/minima-social-icons.svg#twitter"></use></svg> <span class="username">apachemxnet</span></a></li><li><a href="https://youtube.com/apachemxnet"><svg class="svg-icon"><use xlink:href="/versions/1.9.1/assets/minima-social-icons.svg#youtube"></use></svg> <span class="username">apachemxnet</span></a></li></ul>
</div>
<div class="col-4 footer-text">
<p>A flexible and efficient library for deep learning.</p>
</div>
</div>
</div>
</footer>
<footer class="site-footer2">
<div class="wrapper">
<div class="row">
<div class="col-3">
<img src="/versions/1.9.1/assets/img/asf_logo.svg" class="footer-logo col-2" alt="Apache Software Foundation logo">
</div>
<div class="footer-bottom-warning col-9">
<p>"Copyright © 2017-2022, The Apache Software Foundation. Licensed under the Apache License, Version 2.0. Apache MXNet, MXNet, Apache, the Apache
feather, and the Apache MXNet project logo are either registered trademarks or trademarks of the
Apache Software Foundation."</p>
</div>
</div>
</div>
</footer>
</body>
</html>