blob: 63ad963fbe0d21bee475da9bc6e7e7877574c2c2 [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Large Scale Image Classification — mxnet documentation</title>
<link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css" integrity="sha384-1q8mTJOASx8j1Au+a5WDVnPi2lkFfwwEAa8hDDdjZlpLegxhjVME1fgjWPGmkzs7" rel="stylesheet"/>
<link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css" rel="stylesheet"/>
<link href="../../_static/basic.css" rel="stylesheet" type="text/css">
<link href="../../_static/pygments.css" rel="stylesheet" type="text/css">
<link href="../../_static/mxnet.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript">
var DOCUMENTATION_OPTIONS = {
URL_ROOT: '../../',
VERSION: '',
COLLAPSE_INDEX: false,
FILE_SUFFIX: '.html',
HAS_SOURCE: true,
SOURCELINK_SUFFIX: ''
};
</script>
<script src="../../_static/jquery-1.11.1.js" type="text/javascript"></script>
<script src="../../_static/underscore.js" type="text/javascript"></script>
<script src="../../_static/searchtools_custom.js" type="text/javascript"></script>
<script src="../../_static/doctools.js" type="text/javascript"></script>
<script src="../../_static/selectlang.js" type="text/javascript"></script>
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript"></script>
<script type="text/javascript"> jQuery(function() { Search.loadIndex("/searchindex.js"); Search.init();}); </script>
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new
Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-96378503-1', 'auto');
ga('send', 'pageview');
</script>
<!-- -->
<!-- <script type="text/javascript" src="../../_static/jquery.js"></script> -->
<!-- -->
<!-- <script type="text/javascript" src="../../_static/underscore.js"></script> -->
<!-- -->
<!-- <script type="text/javascript" src="../../_static/doctools.js"></script> -->
<!-- -->
<!-- <script type="text/javascript" src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script> -->
<!-- -->
<link href="../index.html" rel="up" title="Tutorials">
<link href="../python/predict_image.html" rel="prev" title="Predict with pre-trained models"/>
<link href="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/mxnet-icon.png" rel="icon" type="image/png"/>
</head>
<body role="document"><!-- Previous Navbar Layout
<div class="navbar navbar-default navbar-fixed-top">
<div class="container">
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false" aria-controls="navbar">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<a href="../../" class="navbar-brand">
<img src="http://data.mxnet.io/theme/mxnet.png">
</a>
</div>
<div id="navbar" class="navbar-collapse collapse">
<ul id="navbar" class="navbar navbar-left">
<li> <a href="../../get_started/index.html">Get Started</a> </li>
<li> <a href="../../tutorials/index.html">Tutorials</a> </li>
<li> <a href="../../how_to/index.html">How To</a> </li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">Packages <span class="caret"></span></a>
<ul class="dropdown-menu">
<li><a href="../../packages/python/index.html">
Python
</a></li>
<li><a href="../../packages/r/index.html">
R
</a></li>
<li><a href="../../packages/julia/index.html">
Julia
</a></li>
<li><a href="../../packages/c++/index.html">
C++
</a></li>
<li><a href="../../packages/scala/index.html">
Scala
</a></li>
<li><a href="../../packages/perl/index.html">
Perl
</a></li>
</ul>
</li>
<li> <a href="../../system/index.html">System</a> </li>
<li>
<form class="" role="search" action="../../search.html" method="get" autocomplete="off">
<div class="form-group inner-addon left-addon">
<i class="glyphicon glyphicon-search"></i>
<input type="text" name="q" class="form-control" placeholder="Search">
</div>
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form> </li>
</ul>
<ul id="navbar" class="navbar navbar-right">
<li> <a href="../../index.html"><span class="flag-icon flag-icon-us"></span></a> </li>
<li> <a href="../..//zh/index.html"><span class="flag-icon flag-icon-cn"></span></a> </li>
</ul>
</div>
</div>
</div>
Previous Navbar Layout End -->
<div class="navbar navbar-fixed-top">
<div class="container" id="navContainer">
<div class="innder" id="header-inner">
<h1 id="logo-wrap">
<a href="../../" id="logo"><img alt="MXNet" src="http://data.mxnet.io/theme/mxnet.png"/></a>
</h1>
<nav class="nav-bar" id="main-nav">
<a class="main-nav-link" href="../../get_started/install.html">Install</a>
<a class="main-nav-link" href="../../tutorials/index.html">Tutorials</a>
<a class="main-nav-link" href="../../how_to/index.html">How To</a>
<span id="dropdown-menu-position-anchor">
<a aria-expanded="true" aria-haspopup="true" class="main-nav-link dropdown-toggle" data-toggle="dropdown" href="#" role="button">API <span class="caret"></span></a>
<ul class="dropdown-menu" id="package-dropdown-menu">
<li><a class="main-nav-link" href="../../api/python/index.html">Python</a></li>
<li><a class="main-nav-link" href="../../api/scala/index.html">Scala</a></li>
<li><a class="main-nav-link" href="../../api/r/index.html">R</a></li>
<li><a class="main-nav-link" href="../../api/julia/index.html">Julia</a></li>
<li><a class="main-nav-link" href="../../api/c++/index.html">C++</a></li>
<li><a class="main-nav-link" href="../../api/perl/index.html">Perl</a></li>
</ul>
</span>
<a class="main-nav-link" href="../../architecture/index.html">Architecture</a>
<!-- <a class="main-nav-link" href="../../community/index.html">Community</a> -->
<a class="main-nav-link" href="https://github.com/dmlc/mxnet">Github</a>
<span id="dropdown-menu-position-anchor-version" style="position: relative"><a href="#" class="main-nav-link dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">Versions(master)<span class="caret"></span></a><ul id="package-dropdown-menu" class="dropdown-menu"><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/">v0.10.14</a></li><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/versions/0.10/index.html">0.10</a></li><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/versions/master/index.html">master</a></li></ul></span></nav>
<script> function getRootPath(){ return "../../" } </script>
<div class="burgerIcon dropdown">
<a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button"></a>
<ul class="dropdown-menu dropdown-menu-right" id="burgerMenu">
<li><a href="../../get_started/install.html">Install</a></li>
<li><a href="../../tutorials/index.html">Tutorials</a></li>
<li><a href="../../how_to/index.html">How To</a></li>
<li class="dropdown-submenu">
<a href="#" tabindex="-1">API</a>
<ul class="dropdown-menu">
<li><a href="../../api/python/index.html" tabindex="-1">Python</a>
</li>
<li><a href="../../api/scala/index.html" tabindex="-1">Scala</a>
</li>
<li><a href="../../api/r/index.html" tabindex="-1">R</a>
</li>
<li><a href="../../api/julia/index.html" tabindex="-1">Julia</a>
</li>
<li><a href="../../api/c++/index.html" tabindex="-1">C++</a>
</li>
<li><a href="../../api/perl/index.html" tabindex="-1">Perl</a>
</li>
</ul>
</li>
<li><a href="../../architecture/index.html">Architecture</a></li>
<li><a class="main-nav-link" href="https://github.com/dmlc/mxnet">Github</a></li>
<li id="dropdown-menu-position-anchor-version-mobile" class="dropdown-submenu" style="position: relative"><a href="#" tabindex="-1">Versions(master)</a><ul class="dropdown-menu"><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/">v0.10.14</a></li><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/versions/0.10/index.html">0.10</a></li><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/versions/master/index.html">master</a></li></ul></li></ul>
</div>
<div class="plusIcon dropdown">
<a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button"><span aria-hidden="true" class="glyphicon glyphicon-plus"></span></a>
<ul class="dropdown-menu dropdown-menu-right" id="plusMenu"></ul>
</div>
<div id="search-input-wrap">
<form action="../../search.html" autocomplete="off" class="" method="get" role="search">
<div class="form-group inner-addon left-addon">
<i class="glyphicon glyphicon-search"></i>
<input class="form-control" name="q" placeholder="Search" type="text"/>
</div>
<input name="check_keywords" type="hidden" value="yes"/>
<input name="area" type="hidden" value="default"/>
</form>
<div id="search-preview"></div>
</div>
<div id="searchIcon">
<span aria-hidden="true" class="glyphicon glyphicon-search"></span>
</div>
<!-- <div id="lang-select-wrap"> -->
<!-- <label id="lang-select-label"> -->
<!-- <\!-- <i class="fa fa-globe"></i> -\-> -->
<!-- <span></span> -->
<!-- </label> -->
<!-- <select id="lang-select"> -->
<!-- <option value="en">Eng</option> -->
<!-- <option value="zh">中文</option> -->
<!-- </select> -->
<!-- </div> -->
<!-- <a id="mobile-nav-toggle">
<span class="mobile-nav-toggle-bar"></span>
<span class="mobile-nav-toggle-bar"></span>
<span class="mobile-nav-toggle-bar"></span>
</a> -->
</div>
</div>
</div>
<div class="container">
<div class="row">
<div aria-label="main navigation" class="sphinxsidebar leftsidebar" role="navigation">
<div class="sphinxsidebarwrapper">
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../../api/python/index.html">Python Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../api/r/index.html">R Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../api/julia/index.html">Julia Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../api/c++/index.html">C++ Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../api/scala/index.html">Scala Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../api/perl/index.html">Perl Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../how_to/index.html">HowTo Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../architecture/index.html">System Documents</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="../index.html">Tutorials</a><ul class="current">
<li class="toctree-l2 current"><a class="reference internal" href="../index.html#python">Python</a><ul class="current">
<li class="toctree-l3"><a class="reference internal" href="../index.html#basics">Basics</a></li>
<li class="toctree-l3 current"><a class="reference internal" href="../index.html#training-and-inference">Training and Inference</a><ul class="current">
<li class="toctree-l4"><a class="reference internal" href="../python/linear-regression.html">Linear Regression</a></li>
<li class="toctree-l4"><a class="reference internal" href="../python/mnist.html">Handwritten Digit Recognition</a></li>
<li class="toctree-l4"><a class="reference internal" href="../python/predict_image.html">Predict with pre-trained models</a></li>
<li class="toctree-l4 current"><a class="current reference internal" href="">Large Scale Image Classification</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../index.html#contributing-tutorials">Contributing Tutorials</a></li>
</ul>
</li>
</ul>
</div>
</div>
<div class="content">
<div class="section" id="large-scale-image-classification">
<span id="large-scale-image-classification"></span><h1>Large Scale Image Classification<a class="headerlink" href="#large-scale-image-classification" title="Permalink to this headline"></a></h1>
<p>Training a neural network with a large number of images presents several challenges. Even with the latest GPUs, it is not possible to train large networks using a large number of images in a reasonable amount of time using a single GPU. This problem can be somewhat mitigated by using multiple GPUs in a single machine. But there is a limit to the number of GPUs that can be attached to one machine (typically 8 or 16). This tutorial explains how to train large networks with terabytes of data using multiple machines each containing multiple GPUs.</p>
<div class="section" id="prerequisites">
<span id="prerequisites"></span><h2>Prerequisites<a class="headerlink" href="#prerequisites" title="Permalink to this headline"></a></h2>
<ul class="simple">
<li>MXNet. See the instructions for your operating system in <a class="reference external" href="http://mxnet.io/get_started/install.html">Setup and Installation</a>.</li>
<li><a class="reference external" href="http://opencv.org/opencv-3-2.html">OpenCV Python library</a></li>
</ul>
<div class="highlight-python"><div class="highlight"><pre><span></span>$ pip install opencv-python
</pre></div>
</div>
</div>
<div class="section" id="preprocessing">
<span id="preprocessing"></span><h2>Preprocessing<a class="headerlink" href="#preprocessing" title="Permalink to this headline"></a></h2>
<div class="section" id="disk-space">
<span id="disk-space"></span><h3>Disk space<a class="headerlink" href="#disk-space" title="Permalink to this headline"></a></h3>
<p>The first step in training with large data is downloading the data and preprocessing it. For this tutorial, we will be using the full ImageNet dataset. Note that at least 2 TB of disk space is required to download and preprocess this data. It is strongly recommended to use an SSD instead of an HDD. SSDs are much better at dealing with a large number of small image files. After the preprocessing completes and images are packed into recordIO files, an HDD should be fine for training.</p>
<p>In this tutorial, we will use an AWS storage instance for data preprocessing. The storage instance <code class="docutils literal"><span class="pre">i3.4xlarge</span></code> has 3.8 TB of disk space across two NVMe SSD disks. We will use software RAID to combine them into one disk and mount it at <code class="docutils literal"><span class="pre">~/data</span></code>.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span>sudo mdadm --create --verbose /dev/md0 --level=stripe --raid-devices=2 \
/dev/nvme0n1 /dev/nvme1n1
sudo mkfs /dev/md0
sudo mkdir ~/data
sudo mount /dev/md0 ~/data
sudo chown $(whoami) ~/data
</pre></div>
</div>
<p>We now have sufficient disk space to download and preprocess the data.</p>
</div>
<div class="section" id="download-imagenet">
<span id="download-imagenet"></span><h3>Download ImageNet<a class="headerlink" href="#download-imagenet" title="Permalink to this headline"></a></h3>
<p>In this tutorial, we will be using the full ImageNet dataset, which can be downloaded from <a class="reference external" href="http://www.image-net.org/download-images">http://www.image-net.org/download-images</a>. <code class="docutils literal"><span class="pre">fall11_whole.tar</span></code> contains all the images. This file is 1.2 TB in size and could take a long time to download.</p>
<p>After downloading, untar the file.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span>export ROOT=full
mkdir $ROOT
tar -xvf fall11_whole.tar -C $ROOT
</pre></div>
</div>
<p>That should give you a collection of tar files. Each tar file represents a category and contains all images belonging to that category. We can extract each tar file and copy the images into a folder named after the tar file.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span>for i in $ROOT/*.tar; do j=${i%.*}; echo $j; mkdir -p $j; tar -xf $i -C $j; done
rm $ROOT/*.tar
ls $ROOT | head
n00004475
n00005787
n00006024
n00006484
n00007846
n00015388
n00017222
n00021265
n00021939
n00120010
</pre></div>
</div>
</div>
<div class="section" id="remove-uncommon-classes-for-transfer-learning-optional">
<span id="remove-uncommon-classes-for-transfer-learning-optional"></span><h3>Remove uncommon classes for transfer learning (optional)<a class="headerlink" href="#remove-uncommon-classes-for-transfer-learning-optional" title="Permalink to this headline"></a></h3>
<p>A common reason to train a network on ImageNet data is to use it for transfer learning (including feature extraction or fine-tuning other models). According to <a class="reference external" href="https://arxiv.org/pdf/1608.08614v1.pdf">this</a> study, classes with too few images don’t help in transfer learning. So, we could remove classes that do not have more than a certain number of images. The following code will remove classes with 500 or fewer images by moving them into a backup directory.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span>BAK=${ROOT}_filtered
mkdir -p ${BAK}
for c in ${ROOT}/n*; do
count=`ls $c/*.JPEG | wc -l`
if [ "$count" -gt "500" ]; then
echo "keep $c, count = $count"
else
echo "remove $c, $count"
mv $c ${BAK}/
fi
done
</pre></div>
</div>
</div>
<div class="section" id="generate-a-validation-set">
<span id="generate-a-validation-set"></span><h3>Generate a validation set<a class="headerlink" href="#generate-a-validation-set" title="Permalink to this headline"></a></h3>
<p>To ensure we don’t overfit the data, we will create a validation set separate from the training set. During training, we will monitor loss on the validation set frequently. We create the validation set by picking fifty random images from each class and moving them to the validation set.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span>VAL_ROOT=${ROOT}_val
mkdir -p ${VAL_ROOT}
for i in ${ROOT}/n*; do
c=`basename $i`
echo $c
mkdir -p ${VAL_ROOT}/$c
for j in `ls $i/*.JPEG | shuf | head -n 50`; do
mv $j ${VAL_ROOT}/$c/
done
done
</pre></div>
</div>
</div>
<div class="section" id="pack-images-into-record-files">
<span id="pack-images-into-record-files"></span><h3>Pack images into record files<a class="headerlink" href="#pack-images-into-record-files" title="Permalink to this headline"></a></h3>
<p>While MXNet can read image files directly, it is recommended to pack the image files into a recordIO file for increased performance. MXNet provides a tool (tools/im2rec.py) to do this. To use this tool, MXNet and OpenCV’s Python module need to be installed on the system.</p>
<p>Set the environment variable <code class="docutils literal"><span class="pre">MXNET</span></code> to point to the MXNet installation directory and <code class="docutils literal"><span class="pre">NAME</span></code> to the name of the dataset. Here, we assume MXNet is installed at <code class="docutils literal"><span class="pre">~/mxnet</span></code>.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span>MXNET=~/mxnet
NAME=full_imagenet_500_filtered
</pre></div>
</div>
<p>To create the recordIO files, we first create a list of images we want in the recordIO files and then use <code class="docutils literal"><span class="pre">im2rec</span></code> to pack images in the list into recordIO files. We create this list in <code class="docutils literal"><span class="pre">train_meta</span></code>. Training data is around 1TB. We split it into 8 parts, with each part roughly 100 GB in size.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span>mkdir -p train_meta
python ${MXNET}/tools/im2rec.py --list True --chunks 8 --recursive True \
train_meta/${NAME} ${ROOT}
</pre></div>
</div>
<p>We then resize the images such that the short edge is 480 pixels long and pack the images into recordIO files. Since most of the work is disk I/O, we use multiple (16) threads to get the work done faster.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span>python ${MXNET}/tools/im2rec.py --resize 480 --quality 90 \
--num-thread 16 train_meta/${NAME} ${ROOT}
</pre></div>
</div>
<p>Once done, we move the rec files into a folder named <code class="docutils literal"><span class="pre">train</span></code>.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span>mkdir -p train
mv train_meta/*.rec train/
</pre></div>
</div>
<p>We do similar preprocessing for the validation set.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span>mkdir -p val_meta
python ${MXNET}/tools/im2rec.py --list True --recursive True \
val_meta/${NAME} ${VAL_ROOT}
python ${MXNET}/tools/im2rec.py --resize 480 --quality 90 \
--num-thread 16 val_meta/${NAME} ${VAL_ROOT}
mkdir -p val
mv val_meta/*.rec val/
</pre></div>
</div>
<p>We now have all training and validation images in recordIO format in <code class="docutils literal"><span class="pre">train</span></code> and <code class="docutils literal"><span class="pre">val</span></code> directories respectively. We can now use these <code class="docutils literal"><span class="pre">.rec</span></code> files for training.</p>
</div>
</div>
<div class="section" id="training">
<span id="training"></span><h2>Training<a class="headerlink" href="#training" title="Permalink to this headline"></a></h2>
<p><a class="reference external" href="https://arxiv.org/abs/1512.03385">ResNet</a> has shown its effectiveness in the ImageNet competition. Our experiments also <a class="reference external" href="https://github.com/tornadomeet/ResNet">reproduced</a> the results reported in the paper. As we increase the number of layers from 18 to 152, we see steady improvement in validation accuracy. Given this is a huge dataset, we will use ResNet with 152 layers.</p>
<p>Due to the huge computational complexity, even the fastest GPU needs more than one day for a single pass of the data. We often need tens of epochs before the training converges to good validation accuracy. While we can use multiple GPUs in a machine, the number of GPUs in a machine is often limited to 8 or 16. For faster training, in this tutorial, we will use multiple machines each containing multiple GPUs to train the model.</p>
<div class="section" id="setup">
<span id="setup"></span><h3>Setup<a class="headerlink" href="#setup" title="Permalink to this headline"></a></h3>
<p>We will use 16 machines (P2.16x instances), each containing 16 GPUs (Tesla K80). These machines are interconnected via 20 Gbps ethernet.</p>
<p>AWS CloudFormation makes it very easy to create deep learning clusters. We follow instructions from <a class="reference external" href="https://aws.amazon.com/blogs/compute/distributed-deep-learning-made-easy/">this</a> page and create a deep learning cluster with 16 P2.16x instances.</p>
<p>We load the data and code onto the first machine (we’ll refer to this machine as the master). We share both the data and code with the other machines using EFS.</p>
<p>If you are setting up your cluster manually, without using AWS CloudFormation, remember to do the following:</p>
<ol>
<li><p class="first">Compile MXNet using <code class="docutils literal"><span class="pre">USE_DIST_KVSTORE=1</span></code> to enable distributed training.</p>
</li>
<li><p class="first">Create a hosts file in the master that contains the host names of all the machines in the cluster. For example,</p>
<div class="highlight-python"><div class="highlight"><pre><span></span>$ head -3 hosts
deeplearning-worker1
deeplearning-worker2
deeplearning-worker3
</pre></div>
</div>
<p>It should be possible to ssh into any of these machines from the master by invoking <code class="docutils literal"><span class="pre">ssh</span></code> with just a hostname from the file. For example,</p>
<div class="highlight-python"><div class="highlight"><pre><span></span>$ ssh deeplearning-worker2
===================================
Deep Learning AMI for Ubuntu
===================================
...
ubuntu@ip-10-0-1-199:~$
</pre></div>
</div>
<p>One way to do this is to use ssh agent forwarding. Please check <a class="reference external" href="https://aws.amazon.com/blogs/security/securely-connect-to-linux-instances-running-in-a-private-amazon-vpc/">this</a> page to learn how to set this up. In short, you’ll configure all machines to log in using a particular certificate (mycert.pem) which is present on your local machine. You then log in to the master using the certificate and the <code class="docutils literal"><span class="pre">-A</span></code> switch to enable agent forwarding. Now, from the master, you should be able to log in to any other machine in the cluster by providing just the hostname (example: <code class="docutils literal"><span class="pre">ssh</span> <span class="pre">deeplearning-worker2</span></code>).</p>
</li>
</ol>
</div>
<div class="section" id="run-training">
<span id="run-training"></span><h3>Run Training<a class="headerlink" href="#run-training" title="Permalink to this headline"></a></h3>
<p>After the cluster is set up, log in to the master and run the following command from ${MXNET}/example/image-classification:</p>
<div class="highlight-python"><div class="highlight"><pre><span></span>../../tools/launch.py -n 16 -H $DEEPLEARNING_WORKERS_PATH python train_imagenet.py --network resnet \
--num-layers 152 --data-train ~/data/train --data-val ~/data/val/ --gpus 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 \
--batch-size 8192 --model ~/data/model/resnet152 --num-epochs 1 --kv-store dist_sync
</pre></div>
</div>
<p>launch.py launches the command it is provided on all the machines in the cluster. The list of machines in the cluster must be provided to launch.py using the <code class="docutils literal"><span class="pre">-H</span></code> switch. Here is a description of the options used for launch.py.</p>
<table border="1" class="docutils">
<colgroup>
<col width="50%"/>
<col width="50%"/>
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Option</th>
<th class="head">Description</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>n</td>
<td>specifies the total number of worker jobs to launch across the cluster. We run 16 workers, one per machine, since we have 16 machines in the cluster.</td>
</tr>
<tr class="row-odd"><td>H</td>
<td>specifies the path to a file that has a list of hostnames of machines in the cluster. Since we created the cluster using the AWS deep learning CloudFormation template, the environment variable <code class="docutils literal"><span class="pre">$DEEPLEARNING_WORKERS_PATH</span></code> points to the required file.</td>
</tr>
</tbody>
</table>
<p>train_imagenet.py trains the network provided by the <code class="docutils literal"><span class="pre">--network</span></code> option using the data provided by the <code class="docutils literal"><span class="pre">--data-train</span></code> and <code class="docutils literal"><span class="pre">--data-val</span></code> options. Here is a description of the options used with train_imagenet.py.</p>
<table border="1" class="docutils">
<colgroup>
<col width="50%"/>
<col width="50%"/>
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Option</th>
<th class="head">Description</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>network</td>
<td>The network to train. Could be any of the networks available in <code class="docutils literal"><span class="pre">${MXNET}/example/image-classification</span></code>. For this tutorial, we use ResNet.</td>
</tr>
<tr class="row-odd"><td>num-layers</td>
<td>Number of layers to use in the network. We use 152 layer Resnet.</td>
</tr>
<tr class="row-even"><td>data-train</td>
<td>Directory containing the training images. We point to the EFS location (<code class="docutils literal"><span class="pre">~/data/train/</span></code>) where we stored the training images.</td>
</tr>
<tr class="row-odd"><td>data-val</td>
<td>Directory containing the validation images. We point to the EFS location (<code class="docutils literal"><span class="pre">~/data/val</span></code>) where we stored the validation images.</td>
</tr>
<tr class="row-even"><td>gpus</td>
<td>Comma separated list of gpu indices to use for training on each machine. We use all 16 GPUs.</td>
</tr>
<tr class="row-odd"><td>batch-size</td>
<td>Batch size across all GPUs. This is equal to batch size per GPU * total number of GPUs. We use a batch size of 32 images per GPU. So, effective batch size is 32 * 16 * 16 = 8192.</td>
</tr>
<tr class="row-even"><td>model</td>
<td>Path prefix for the model file created by the training.</td>
</tr>
<tr class="row-odd"><td>num-epochs</td>
<td>Number of epochs to train.</td>
</tr>
<tr class="row-even"><td>kv-store</td>
<td>Key/Value store for parameter synchronization. We use distributed kv store since we are doing distributed training.</td>
</tr>
</tbody>
</table>
<p>After training is complete, trained models are available in the directory specified by the <code class="docutils literal"><span class="pre">--model</span></code> option. Models are saved in two parts: model-symbol.json for the network definition and model-n.params for the parameters saved after the n’th epoch.</p>
</div>
</div>
<div class="section" id="scalability">
<span id="scalability"></span><h2>Scalability<a class="headerlink" href="#scalability" title="Permalink to this headline"></a></h2>
<p>One common concern with using a large number of machines for training is scalability. We have benchmarked scalability running several popular networks on clusters with up to 256 GPUs and the speedup is very close to ideal.</p>
<p>This scalability test was run on sixteen P2.16xl instances with 256 GPUs in total. We used AWS deep learning AMI with CUDA 7.5 and CUDNN 5.1 installed.</p>
<p>We kept the batch size per GPU constant and doubled the number of GPUs for every subsequent test. Synchronized SGD (<code class="docutils literal"><span class="pre">--kv-store</span> <span class="pre">dist_device_sync</span></code>) was used. The CNNs used are located <a class="reference external" href="https://github.com/dmlc/mxnet/tree/master/example/image-classification/symbols">here</a>.</p>
<table border="1" class="docutils">
<colgroup>
<col width="25%"/>
<col width="25%"/>
<col width="25%"/>
<col width="25%"/>
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head"> </th>
<th class="head">alexnet</th>
<th class="head">inception-v3</th>
<th class="head">resnet-152</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>batch size per GPU</td>
<td>512</td>
<td>32</td>
<td>32</td>
</tr>
<tr class="row-odd"><td>model size (MB)</td>
<td>203</td>
<td>95</td>
<td>240</td>
</tr>
</tbody>
</table>
<p>Number of images processed per second is shown in the following table:</p>
<table border="1" class="docutils">
<colgroup>
<col width="25%"/>
<col width="25%"/>
<col width="25%"/>
<col width="25%"/>
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Number of GPUs</th>
<th class="head">Alexnet</th>
<th class="head">Inception-v3</th>
<th class="head">Resnet-152</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>1</td>
<td>457.07</td>
<td>30.4</td>
<td>20.8</td>
</tr>
<tr class="row-odd"><td>2</td>
<td>870.43</td>
<td>59.61</td>
<td>38.76</td>
</tr>
<tr class="row-even"><td>4</td>
<td>1514.8</td>
<td>117.9</td>
<td>77.01</td>
</tr>
<tr class="row-odd"><td>8</td>
<td>2852.5</td>
<td>233.39</td>
<td>153.07</td>
</tr>
<tr class="row-even"><td>16</td>
<td>4244.18</td>
<td>447.61</td>
<td>298.03</td>
</tr>
<tr class="row-odd"><td>32</td>
<td>7945.57</td>
<td>882.57</td>
<td>595.53</td>
</tr>
<tr class="row-even"><td>64</td>
<td>15840.52</td>
<td>1761.24</td>
<td>1179.86</td>
</tr>
<tr class="row-odd"><td>128</td>
<td>31334.88</td>
<td>3416.2</td>
<td>2333.47</td>
</tr>
<tr class="row-even"><td>256</td>
<td>61938.36</td>
<td>6660.98</td>
<td>4630.42</td>
</tr>
</tbody>
</table>
<p>The following figure shows speedup against the number of GPUs used and compares it with ideal speedup.</p>
<p><img alt="Speedup Graph" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/speedup-p2.png"/></p>
</div>
<div class="section" id="troubleshooting-guidelines">
<span id="troubleshooting-guidelines"></span><h2>Troubleshooting guidelines<a class="headerlink" href="#troubleshooting-guidelines" title="Permalink to this headline"></a></h2>
<div class="section" id="validation-accuracy">
<span id="validation-accuracy"></span><h3>Validation accuracy<a class="headerlink" href="#validation-accuracy" title="Permalink to this headline"></a></h3>
<p>It is often straightforward to achieve a reasonable validation accuracy, but achieving the state-of-the-art numbers reported in papers can sometimes be very hard. Here are a few things you can try to improve validation accuracy.</p>
<ul class="simple">
<li>Adding more data augmentations often reduces the gap between training and validation accuracy. Data augmentation could be reduced in epochs closer to the end.</li>
<li>Start with a large learning rate and keep it large for a long time. For example, in CIFAR10, you could keep the learning rate at 0.1 for the first 200 epochs and then reduce it to 0.01.</li>
<li>Do not use a batch size that is too large, especially one much larger than the number of classes (batch size &gt;&gt; number of classes).</li>
</ul>
</div>
<div class="section" id="speed">
<span id="speed"></span><h3>Speed<a class="headerlink" href="#speed" title="Permalink to this headline"></a></h3>
<ul class="simple">
<li>Distributed training improves speed when the computation cost of a batch is high, so make sure your workload is not too small (like LeNet on MNIST) and that the batch size is reasonably large.</li>
<li>Make sure data reading and preprocessing are not the bottleneck. Use the <code class="docutils literal"><span class="pre">--test-io</span> <span class="pre">1</span></code> flag to check how many images can be pre-processed per second.</li>
<li>Increase <code class="docutils literal"><span class="pre">--data-nthreads</span></code> (default is 4) to use more threads for data preprocessing.</li>
<li>Data preprocessing is done by OpenCV. If OpenCV is compiled from source code, check that it is configured correctly.</li>
<li>Use <code class="docutils literal"><span class="pre">--benchmark</span> <span class="pre">1</span></code> to use randomly generated data rather than real data to narrow down where the bottleneck is.</li>
<li>Check the <a class="reference external" href="http://mxnet.io/how_to/perf.html">performance page</a> for more details.</li>
</ul>
</div>
<div class="section" id="memory">
<span id="memory"></span><h3>Memory<a class="headerlink" href="#memory" title="Permalink to this headline"></a></h3>
<p>If the batch size is too big, it can exhaust GPU memory. If this happens, you’ll see the error message “cudaMalloc failed: out of memory” or something similar. There are a couple of ways to fix this:</p>
<ul class="simple">
<li>Reduce the batch size.</li>
<li>Set the environment variable <code class="docutils literal"><span class="pre">MXNET_BACKWARD_DO_MIRROR</span></code> to 1. It reduces memory consumption at the cost of speed. For example, with a batch size of 64, inception-v3 uses 10 GB of memory and trains at 30 images/sec on a single K80 GPU. When mirroring is enabled, the same 10 GB of GPU memory is enough to run inception-v3 with a batch size of 128. The cost is that the speed drops to 27 images/sec.</li>
</ul>
</div>
</div>
</div>
<div class="container">
<div class="footer">
<p> © 2015-2017 DMLC. All rights reserved. </p>
</div>
</div>
</div>
<div aria-label="main navigation" class="sphinxsidebar rightsidebar" role="navigation">
<div class="sphinxsidebarwrapper">
<h3><a href="../../index.html">Table Of Contents</a></h3>
<ul>
<li><a class="reference internal" href="#">Large Scale Image Classification</a><ul>
<li><a class="reference internal" href="#prerequisites">Prerequisites</a></li>
<li><a class="reference internal" href="#preprocessing">Preprocessing</a><ul>
<li><a class="reference internal" href="#disk-space">Disk space</a></li>
<li><a class="reference internal" href="#download-imagenet">Download ImageNet</a></li>
<li><a class="reference internal" href="#remove-uncommon-classes-for-transfer-learning-optional">Remove uncommon classes for transfer learning (optional)</a></li>
<li><a class="reference internal" href="#generate-a-validation-set">Generate a validation set</a></li>
<li><a class="reference internal" href="#pack-images-into-record-files">Pack images into record files</a></li>
</ul>
</li>
<li><a class="reference internal" href="#training">Training</a><ul>
<li><a class="reference internal" href="#setup">Setup</a></li>
<li><a class="reference internal" href="#run-training">Run Training</a></li>
</ul>
</li>
<li><a class="reference internal" href="#scalability">Scalability</a></li>
<li><a class="reference internal" href="#troubleshooting-guidelines">Troubleshooting guidelines</a><ul>
<li><a class="reference internal" href="#validation-accuracy">Validation accuracy</a></li>
<li><a class="reference internal" href="#speed">Speed</a></li>
<li><a class="reference internal" href="#memory">Memory</a></li>
</ul>
</li>
</ul>
</li>
</ul>
</div>
</div>
</div> <!-- pagename != index -->
<script crossorigin="anonymous" integrity="sha384-0mSbJDEHialfmuBBQP6A4Qrprq5OVfW37PRR3j5ELqxss1yVqOtnepnHVP9aJ7xS" src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js"></script>
<script src="../../_static/js/sidebar.js" type="text/javascript"></script>
<script src="../../_static/js/search.js" type="text/javascript"></script>
<script src="../../_static/js/navbar.js" type="text/javascript"></script>
<script src="../../_static/js/clipboard.min.js" type="text/javascript"></script>
<script src="../../_static/js/copycode.js" type="text/javascript"></script>
<script type="text/javascript">
// Make the page visible once the DOM is ready. jQuery fires ready callbacks
// when the document is ready, whichever element .ready() is attached to.
// NOTE(review): presumably a stylesheet hides the body element until this
// handler runs - confirm against mxnet.css.
$(document).ready(function revealBody() {
  $(document.body).css({ visibility: 'visible' });
});
</script>
</div></body>
</html>