<!-- blob: dd19329afdea7e4dd724525d50bbaa9a6db92a87 [file] [log] [blame] -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>MXNet on the Cloud — mxnet documentation</title>
<link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css" integrity="sha384-1q8mTJOASx8j1Au+a5WDVnPi2lkFfwwEAa8hDDdjZlpLegxhjVME1fgjWPGmkzs7" rel="stylesheet"/>
<link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css" rel="stylesheet"/>
<link href="../_static/basic.css" rel="stylesheet" type="text/css"/>
<link href="../_static/pygments.css" rel="stylesheet" type="text/css"/>
<link href="../_static/mxnet.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript">
// Page-level documentation settings, exposed as a global object.
// NOTE(review): presumably read by the Sphinx helper scripts loaded after this
// block (doctools.js / searchtools_custom.js) — confirm against those files.
var DOCUMENTATION_OPTIONS = {
  // Relative prefix; matches the '../' used for the _static assets on this page.
  URL_ROOT: '../',
  // No version string is set for this build.
  VERSION: '',
  COLLAPSE_INDEX: false,
  // Extension appended to generated page names.
  FILE_SUFFIX: '.html',
  HAS_SOURCE: true,
  SOURCELINK_SUFFIX: ''
};
</script>
<script src="../_static/jquery-1.11.1.js" type="text/javascript"></script>
<script src="../_static/underscore.js" type="text/javascript"></script>
<script src="../_static/searchtools_custom.js" type="text/javascript"></script>
<script src="../_static/doctools.js" type="text/javascript"></script>
<script src="../_static/selectlang.js" type="text/javascript"></script>
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript"></script>
<script type="text/javascript">
  // When the DOM is ready, hand the index script path to Search.loadIndex
  // and then call Search.init.
  // NOTE(review): the index path is site-absolute ("/searchindex.js") while the
  // other local assets on this page are referenced relative to "../" — confirm
  // this is intended for every deployment location.
  jQuery(function () {
    Search.loadIndex("/searchindex.js");
    Search.init();
  });
</script>
<script>
// Google Analytics (analytics.js) asynchronous loader.
// The immediately-invoked function below is called with
// (window, document, 'script', <analytics.js URL>, 'ga') and:
//   - stores the name 'ga' in window.GoogleAnalyticsObject;
//   - defines window.ga, unless it already exists, as a stub that pushes each
//     call's arguments onto the ga.q array;
//   - records the current time (as a number) in ga.l;
//   - creates a <script> element with async set, points it at the analytics.js
//     URL, and inserts it before the first <script> element in the document.
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new
Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
// Issue a 'create' command for property UA-96378503-1, then a 'send' command
// for a 'pageview' hit.
ga('create', 'UA-96378503-1', 'auto');
ga('send', 'pageview');
</script>
<!-- -->
<!-- <script type="text/javascript" src="../_static/jquery.js"></script> -->
<!-- -->
<!-- <script type="text/javascript" src="../_static/underscore.js"></script> -->
<!-- -->
<!-- <script type="text/javascript" src="../_static/doctools.js"></script> -->
<!-- -->
<!-- <script type="text/javascript" src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script> -->
<!-- -->
<link href="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/mxnet-icon.png" rel="icon" type="image/png"/>
</head>
<body role="document"><!-- Previous Navbar Layout
<div class="navbar navbar-default navbar-fixed-top">
<div class="container">
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false" aria-controls="navbar">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<a href="../" class="navbar-brand">
<img src="http://data.mxnet.io/theme/mxnet.png">
</a>
</div>
<div id="navbar" class="navbar-collapse collapse">
<ul id="navbar" class="navbar navbar-left">
<li> <a href="../get_started/index.html">Get Started</a> </li>
<li> <a href="../tutorials/index.html">Tutorials</a> </li>
<li> <a href="../how_to/index.html">How To</a> </li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">Packages <span class="caret"></span></a>
<ul class="dropdown-menu">
<li><a href="../packages/python/index.html">
Python
</a></li>
<li><a href="../packages/r/index.html">
R
</a></li>
<li><a href="../packages/julia/index.html">
Julia
</a></li>
<li><a href="../packages/c++/index.html">
C++
</a></li>
<li><a href="../packages/scala/index.html">
Scala
</a></li>
<li><a href="../packages/perl/index.html">
Perl
</a></li>
</ul>
</li>
<li> <a href="../system/index.html">System</a> </li>
<li>
<form class="" role="search" action="../search.html" method="get" autocomplete="off">
<div class="form-group inner-addon left-addon">
<i class="glyphicon glyphicon-search"></i>
<input type="text" name="q" class="form-control" placeholder="Search">
</div>
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form> </li>
</ul>
<ul id="navbar" class="navbar navbar-right">
<li> <a href="../index.html"><span class="flag-icon flag-icon-us"></span></a> </li>
<li> <a href="..//zh/index.html"><span class="flag-icon flag-icon-cn"></span></a> </li>
</ul>
</div>
</div>
</div>
Previous Navbar Layout End -->
<div class="navbar navbar-fixed-top">
<div class="container" id="navContainer">
<div class="innder" id="header-inner">
<h1 id="logo-wrap">
<a href="../" id="logo"><img alt="MXNet" src="http://data.mxnet.io/theme/mxnet.png"/></a>
</h1>
<nav class="nav-bar" id="main-nav">
<a class="main-nav-link" href="../get_started/install.html">Install</a>
<a class="main-nav-link" href="../tutorials/index.html">Tutorials</a>
<a class="main-nav-link" href="../how_to/index.html">How To</a>
<span id="dropdown-menu-position-anchor">
<a aria-expanded="true" aria-haspopup="true" class="main-nav-link dropdown-toggle" data-toggle="dropdown" href="#" role="button">API <span class="caret"></span></a>
<ul class="dropdown-menu" id="package-dropdown-menu">
<li><a class="main-nav-link" href="../api/python/index.html">Python</a></li>
<li><a class="main-nav-link" href="../api/scala/index.html">Scala</a></li>
<li><a class="main-nav-link" href="../api/r/index.html">R</a></li>
<li><a class="main-nav-link" href="../api/julia/index.html">Julia</a></li>
<li><a class="main-nav-link" href="../api/c++/index.html">C++</a></li>
<li><a class="main-nav-link" href="../api/perl/index.html">Perl</a></li>
</ul>
</span>
<a class="main-nav-link" href="../architecture/index.html">Architecture</a>
<!-- <a class="main-nav-link" href="../community/index.html">Community</a> -->
<a class="main-nav-link" href="https://github.com/dmlc/mxnet">Github</a>
<span id="dropdown-menu-position-anchor-version" style="position: relative"><a href="#" class="main-nav-link dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">Versions(master)<span class="caret"></span></a><ul id="package-dropdown-menu" class="dropdown-menu"><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/">v0.10.14</a></li><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/versions/0.10/index.html">0.10</a></li><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/versions/master/index.html">master</a></li></ul></span></nav>
<script>
  // Returns the relative path from this page to the documentation root.
  function getRootPath() {
    var rootPath = "../";
    return rootPath;
  }
</script>
<div class="burgerIcon dropdown">
<a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button"></a>
<ul class="dropdown-menu dropdown-menu-right" id="burgerMenu">
<li><a href="../get_started/install.html">Install</a></li>
<li><a href="../tutorials/index.html">Tutorials</a></li>
<li><a href="../how_to/index.html">How To</a></li>
<li class="dropdown-submenu">
<a href="#" tabindex="-1">API</a>
<ul class="dropdown-menu">
<li><a href="../api/python/index.html" tabindex="-1">Python</a>
</li>
<li><a href="../api/scala/index.html" tabindex="-1">Scala</a>
</li>
<li><a href="../api/r/index.html" tabindex="-1">R</a>
</li>
<li><a href="../api/julia/index.html" tabindex="-1">Julia</a>
</li>
<li><a href="../api/c++/index.html" tabindex="-1">C++</a>
</li>
<li><a href="../api/perl/index.html" tabindex="-1">Perl</a>
</li>
</ul>
</li>
<li><a href="../architecture/index.html">Architecture</a></li>
<li><a class="main-nav-link" href="https://github.com/dmlc/mxnet">Github</a></li>
<li id="dropdown-menu-position-anchor-version-mobile" class="dropdown-submenu" style="position: relative"><a href="#" tabindex="-1">Versions(master)</a><ul class="dropdown-menu"><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/">v0.10.14</a></li><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/versions/0.10/index.html">0.10</a></li><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/versions/master/index.html">master</a></li></ul></li></ul>
</div>
<div class="plusIcon dropdown">
<a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button"><span aria-hidden="true" class="glyphicon glyphicon-plus"></span></a>
<ul class="dropdown-menu dropdown-menu-right" id="plusMenu"></ul>
</div>
<div id="search-input-wrap">
<form action="../search.html" autocomplete="off" class="" method="get" role="search">
<div class="form-group inner-addon left-addon">
<i class="glyphicon glyphicon-search"></i>
<input class="form-control" name="q" placeholder="Search" type="text"/>
</div>
<input name="check_keywords" type="hidden" value="yes"/>
<input name="area" type="hidden" value="default"/>
</form>
<div id="search-preview"></div>
</div>
<div id="searchIcon">
<span aria-hidden="true" class="glyphicon glyphicon-search"></span>
</div>
<!-- <div id="lang-select-wrap"> -->
<!-- <label id="lang-select-label"> -->
<!-- <\!-- <i class="fa fa-globe"></i> -\-> -->
<!-- <span></span> -->
<!-- </label> -->
<!-- <select id="lang-select"> -->
<!-- <option value="en">Eng</option> -->
<!-- <option value="zh">中文</option> -->
<!-- </select> -->
<!-- </div> -->
<!-- <a id="mobile-nav-toggle">
<span class="mobile-nav-toggle-bar"></span>
<span class="mobile-nav-toggle-bar"></span>
<span class="mobile-nav-toggle-bar"></span>
</a> -->
</div>
</div>
</div>
<div class="container">
<div class="row">
<div aria-label="main navigation" class="sphinxsidebar leftsidebar" role="navigation">
<div class="sphinxsidebarwrapper">
<ul>
<li class="toctree-l1"><a class="reference internal" href="../api/python/index.html">Python Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../api/r/index.html">R Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../api/julia/index.html">Julia Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../api/c++/index.html">C++ Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../api/scala/index.html">Scala Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../api/perl/index.html">Perl Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="index.html">HowTo Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/index.html">System Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tutorials/index.html">Tutorials</a></li>
</ul>
</div>
</div>
<div class="content">
<div class="section" id="mxnet-on-the-cloud">
<span id="mxnet-on-the-cloud"></span><h1>MXNet on the Cloud<a class="headerlink" href="#mxnet-on-the-cloud" title="Permalink to this headline"></a></h1>
<p>Deep learning can require extremely powerful hardware, often for unpredictable durations of time.
Moreover, <em>MXNet</em> can benefit from both multiple GPUs and multiple machines.
Accordingly, cloud computing, as offered by AWS and others,
is especially well suited to training deep learning models.
Using AWS, we can rapidly fire up multiple machines with multiple GPUs each at will
and maintain the resources for precisely the amount of time needed.</p>
<div class="section" id="set-up-an-aws-gpu-cluster-from-scratch">
<span id="set-up-an-aws-gpu-cluster-from-scratch"></span><h2>Set Up an AWS GPU Cluster from Scratch<a class="headerlink" href="#set-up-an-aws-gpu-cluster-from-scratch" title="Permalink to this headline"></a></h2>
<p>In this document, we provide a step-by-step guide that will teach you
how to set up an AWS cluster with <em>MXNet</em>. We show how to:</p>
<ul class="simple">
<li><a class="reference external" href="#use-amazon-s3-to-host-data">Use Amazon S3 to host data</a></li>
<li><a class="reference external" href="#set-up-an-ec2-gpu-instance-from-scratch">Set up an EC2 GPU instance with all dependencies installed</a></li>
<li><a class="reference external" href="#build-and-run-mxnet-on-a-gpu-instance">Build and run MXNet on a single computer</a></li>
<li><a class="reference external" href="#set-up-an-ec2-gpu-cluster-for-distributed-training">Set up an EC2 GPU cluster for distributed training</a></li>
</ul>
<div class="section" id="use-amazon-s3-to-host-data">
<span id="use-amazon-s3-to-host-data"></span><h3>Use Amazon S3 to Host Data<a class="headerlink" href="#use-amazon-s3-to-host-data" title="Permalink to this headline"></a></h3>
<p>Amazon S3 provides distributed data storage which proves especially convenient for hosting large datasets.
To use S3, you need <a class="reference external" href="http://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSGettingStartedGuide/AWSCredentials.html">AWS credentials</a>,
including an <code class="docutils literal"><span class="pre">ACCESS_KEY_ID</span></code> and a <code class="docutils literal"><span class="pre">SECRET_ACCESS_KEY</span></code>.</p>
<p>To use <em>MXNet</em> with S3, set the environment variables <code class="docutils literal"><span class="pre">AWS_ACCESS_KEY_ID</span></code> and
<code class="docutils literal"><span class="pre">AWS_SECRET_ACCESS_KEY</span></code> by adding the following two lines in
<code class="docutils literal"><span class="pre">~/.bashrc</span></code> (replacing the strings with the correct ones):</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span><span class="nb">export</span> <span class="nv">AWS_ACCESS_KEY_ID</span><span class="o">=</span>AKIAIOSFODNN7EXAMPLE
<span class="nb">export</span> <span class="nv">AWS_SECRET_ACCESS_KEY</span><span class="o">=</span>wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
</pre></div>
</div>
<p>There are several ways to upload data to S3. One simple way is to use
<a class="reference external" href="http://s3tools.org/s3cmd">s3cmd</a>. For example:</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span>wget http://data.mxnet.io/mxnet/data/mnist.zip
unzip mnist.zip <span class="o">&amp;&amp;</span> s3cmd put t*-ubyte s3://dmlc/mnist/
</pre></div>
</div>
</div>
<div class="section" id="use-pre-installed-ec2-gpu-instance">
<span id="use-pre-installed-ec2-gpu-instance"></span><h3>Use Pre-installed EC2 GPU Instance<a class="headerlink" href="#use-pre-installed-ec2-gpu-instance" title="Permalink to this headline"></a></h3>
<p>The <a class="reference external" href="https://aws.amazon.com/marketplace/pp/B01M0AXXQB?qid=1475211685369&amp;sr=0-1&amp;ref_=srh_res_product_title">Deep Learning AMI</a> is an Amazon Linux image
supported and maintained by Amazon Web Services for use on Amazon Elastic Compute Cloud (Amazon EC2).
It contains <a class="reference external" href="https://github.com/dmlc/mxnet">MXNet-v0.9.3 tag</a> and the necessary components to get going with deep learning,
including Nvidia drivers, CUDA, cuDNN, Anaconda, Python2 and Python3. The AMI IDs are the following:</p>
<ul class="simple">
<li>us-east-1: ami-e7c96af1</li>
<li>us-west-2: ami-dfb13ebf</li>
<li>eu-west-1: ami-6e5d6808</li>
</ul>
<p>Now you can launch <em>MXNet</em> directly on an EC2 GPU instance. You can also use a <a class="reference external" href="http://jupyter.org">Jupyter</a> notebook on an EC2 machine.
Here is a <a class="reference external" href="https://github.com/dmlc/mxnet-notebooks">good tutorial</a>
on how to connect to a Jupyter notebook running on an EC2 instance.</p>
</div>
<div class="section" id="set-up-an-ec2-gpu-instance-from-scratch">
<span id="set-up-an-ec2-gpu-instance-from-scratch"></span><h3>Set Up an EC2 GPU Instance from Scratch<a class="headerlink" href="#set-up-an-ec2-gpu-instance-from-scratch" title="Permalink to this headline"></a></h3>
<p><em>MXNet</em> requires the following libraries:</p>
<ul class="simple">
<li>C++ compiler with C++11 support, such as <code class="docutils literal"><span class="pre">gcc</span> <span class="pre">>=</span> <span class="pre">4.8</span></code></li>
<li><code class="docutils literal"><span class="pre">CUDA</span></code> (<code class="docutils literal"><span class="pre">CUDNN</span></code> is optional) for GPU linear algebra</li>
<li><code class="docutils literal"><span class="pre">BLAS</span></code> (cblas, open-blas, atlas, mkl, or others) for CPU linear algebra</li>
<li><code class="docutils literal"><span class="pre">opencv</span></code> for image augmentations</li>
<li><code class="docutils literal"><span class="pre">curl</span></code> and <code class="docutils literal"><span class="pre">openssl</span></code> for the ability to read/write to Amazon S3</li>
</ul>
<p>Installing <code class="docutils literal"><span class="pre">CUDA</span></code> on EC2 instances requires some effort. Caffe has a good
<a class="reference external" href="https://github.com/BVLC/caffe/wiki/Install-Caffe-on-EC2-from-scratch-(Ubuntu,-CUDA-7,-cuDNN-3)">tutorial</a>
on how to install CUDA 7.0 on Ubuntu 14.04.</p>
<p><strong><em>Note:</em></strong> We tried CUDA 7.5 on Nov 7, 2015, but found it problematic.</p>
<p>You can install the rest using the package manager. For example, on Ubuntu:</p>
<div class="highlight-python"><div class="highlight"><pre><span></span>sudo apt-get update
sudo apt-get install -y build-essential git libcurl4-openssl-dev libatlas-base-dev libopencv-dev python-numpy
</pre></div>
</div>
<p>The Amazon Machine Image (AMI) <a class="reference external" href="https://console.aws.amazon.com/ec2/v2/home?region=us-east-1#LaunchInstanceWizard:ami=ami-12fd8178">ami-12fd8178</a> has the packages listed above installed.</p>
</div>
<div class="section" id="build-and-run-mxnet-on-a-gpu-instance">
<span id="build-and-run-mxnet-on-a-gpu-instance"></span><h3>Build and Run MXNet on a GPU Instance<a class="headerlink" href="#build-and-run-mxnet-on-a-gpu-instance" title="Permalink to this headline"></a></h3>
<p>The following commands build <em>MXNet</em> with CUDA/CUDNN, Amazon S3, and distributed
training.</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span>git clone --recursive https://github.com/dmlc/mxnet
<span class="nb">cd</span> mxnet<span class="p">;</span> cp make/config.mk .
<span class="nb">echo</span> <span class="s2">"USE_CUDA=1"</span> >>config.mk
<span class="nb">echo</span> <span class="s2">"USE_CUDA_PATH=/usr/local/cuda"</span> >>config.mk
<span class="nb">echo</span> <span class="s2">"USE_CUDNN=1"</span> >>config.mk
<span class="nb">echo</span> <span class="s2">"USE_BLAS=atlas"</span> >> config.mk
<span class="nb">echo</span> <span class="s2">"USE_DIST_KVSTORE = 1"</span> >>config.mk
<span class="nb">echo</span> <span class="s2">"USE_S3=1"</span> >>config.mk
make -j<span class="k">$(</span>nproc<span class="k">)</span>
</pre></div>
</div>
<p>To test whether everything is installed properly, we can try training a convolutional neural network (CNN) on the MNIST dataset using a GPU:</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span>python tests/python/gpu/test_conv.py
</pre></div>
</div>
<p>If you’ve placed the MNIST data on <code class="docutils literal"><span class="pre">s3://dmlc/mnist</span></code>, you can read the data stored on Amazon S3 directly with the following command:</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span>sed -i.bak <span class="s2">"s!data_dir = 'data'!data_dir = 's3://dmlc/mnist'!"</span> tests/python/gpu/test_conv.py
</pre></div>
</div>
<p><strong><em>Note:</em></strong> You can use <code class="docutils literal"><span class="pre">sudo</span> <span class="pre">ln</span> <span class="pre">/dev/null</span> <span class="pre">/dev/raw1394</span></code> to fix the opencv error <code class="docutils literal"><span class="pre">libdc1394</span> <span class="pre">error:</span> <span class="pre">Failed</span> <span class="pre">to</span> <span class="pre">initialize</span> <span class="pre">libdc1394</span></code>.</p>
</div>
<div class="section" id="set-up-an-ec2-gpu-cluster-for-distributed-training">
<span id="set-up-an-ec2-gpu-cluster-for-distributed-training"></span><h3>Set Up an EC2 GPU Cluster for Distributed Training<a class="headerlink" href="#set-up-an-ec2-gpu-cluster-for-distributed-training" title="Permalink to this headline"></a></h3>
<p>A cluster consists of multiple computers.
You can use one computer with <em>MXNet</em> installed as the root computer for submitting jobs, and then launch several
slave computers to run the jobs. For example, launch multiple instances using an
AMI, e.g.,
<a class="reference external" href="https://console.aws.amazon.com/ec2/v2/home?region=us-east-1#LaunchInstanceWizard:ami=ami-12fd8178">ami-12fd8178</a>,
with dependencies installed. There are two options:</p>
<ul class="simple">
<li>Make all slaves’ ports accessible (same for the root) by setting type: All TCP,
Source: Anywhere in Configure Security Group.</li>
<li>Use the same <code class="docutils literal"><span class="pre">pem</span></code> as the root computer to access all slave computers, and
then copy the <code class="docutils literal"><span class="pre">pem</span></code> file into the root computer’s <code class="docutils literal"><span class="pre">~/.ssh/id_rsa</span></code>. If you do this, all slave computers can be accessed with SSH from the root.</li>
</ul>
<p>Now, run the CNN on multiple computers. Assume that we are in a working
directory of the root computer, such as <code class="docutils literal"><span class="pre">~/train</span></code>, and MXNet is built as <code class="docutils literal"><span class="pre">~/mxnet</span></code>.</p>
<ol class="simple">
<li>Pack the <em>MXNet</em> Python library into this working directory for easy
synchronization:</li>
</ol>
<div class="highlight-bash"><div class="highlight"><pre><span></span>cp -r ~/mxnet/python/mxnet .
cp ~/mxnet/lib/libmxnet.so mxnet/
</pre></div>
</div>
<p>And then copy the training program:</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span>cp ~/mxnet/example/image-classification/*.py .
cp -r ~/mxnet/example/image-classification/common .
</pre></div>
</div>
<ol class="simple" start="2">
<li>Prepare a host file with all slaves’ private IPs. For example, <code class="docutils literal"><span class="pre">cat</span> <span class="pre">hosts</span></code>:</li>
</ol>
<div class="highlight-bash"><div class="highlight"><pre><span></span><span class="m">172</span>.30.0.172
<span class="m">172</span>.30.0.171
</pre></div>
</div>
<ol class="simple" start="3">
<li>Assuming that there are two computers, train the CNN using two workers:</li>
</ol>
<div class="highlight-bash"><div class="highlight"><pre><span></span>~/mxnet/tools/launch.py -n <span class="m">2</span> -H hosts --sync-dir /tmp/mxnet python train_mnist.py --kv-store dist_sync
</pre></div>
</div>
<p><strong><em>Note:</em></strong> Sometimes the jobs linger at the slave computers even though you’ve pressed <code class="docutils literal"><span class="pre">Ctrl-c</span></code>
at the root node. To terminate them, use the following command:</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span>cat hosts <span class="p">|</span> xargs -I<span class="o">{}</span> ssh -o <span class="nv">StrictHostKeyChecking</span><span class="o">=</span>no <span class="o">{}</span> <span class="s1">'uname -a; pgrep python | xargs kill -9'</span>
</pre></div>
</div>
<p><strong><em>Note:</em></strong> The preceding example is very simple to train and therefore isn’t a good
benchmark for distributed training. Consider using other <a class="reference external" href="https://github.com/dmlc/mxnet/tree/master/example/image-classification">examples</a>.</p>
</div>
<div class="section" id="more-options">
<span id="more-options"></span><h3>More Options<a class="headerlink" href="#more-options" title="Permalink to this headline"></a></h3>
<div class="section" id="use-multiple-data-shards">
<span id="use-multiple-data-shards"></span><h4>Use Multiple Data Shards<a class="headerlink" href="#use-multiple-data-shards" title="Permalink to this headline"></a></h4>
<p>It is common to pack a dataset into multiple files, especially when working in a distributed environment.
<em>MXNet</em> supports direct loading from multiple data shards.
Put all of the record files into a folder, and point the data path to the folder.</p>
</div>
<div class="section" id="use-yarn-and-sge">
<span id="use-yarn-and-sge"></span><h4>Use YARN and SGE<a class="headerlink" href="#use-yarn-and-sge" title="Permalink to this headline"></a></h4>
<p>Although using SSH can be simple when you don’t have a cluster scheduling framework,
<em>MXNet</em> is designed to be portable to various platforms. We provide scripts available in <a class="reference external" href="https://github.com/dmlc/dmlc-core/tree/master/tracker">tracker</a>
to allow running on other cluster frameworks, including Hadoop (YARN) and SGE.
We welcome contributions from the community of examples of running <em>MXNet</em> on your favorite distributed platform.</p>
</div>
</div>
</div>
</div>
<div class="container">
<div class="footer">
<p> © 2015-2017 DMLC. All rights reserved. </p>
</div>
</div>
</div>
<div aria-label="main navigation" class="sphinxsidebar rightsidebar" role="navigation">
<div class="sphinxsidebarwrapper">
<h3><a href="../index.html">Table Of Contents</a></h3>
<ul>
<li><a class="reference internal" href="#">MXNet on the Cloud</a><ul>
<li><a class="reference internal" href="#set-up-an-aws-gpu-cluster-from-scratch">Set Up an AWS GPU Cluster from Scratch</a><ul>
<li><a class="reference internal" href="#use-amazon-s3-to-host-data">Use Amazon S3 to Host Data</a></li>
<li><a class="reference internal" href="#use-pre-installed-ec2-gpu-instance">Use Pre-installed EC2 GPU Instance</a></li>
<li><a class="reference internal" href="#set-up-an-ec2-gpu-instance-from-scratch">Set Up an EC2 GPU Instance from Scratch</a></li>
<li><a class="reference internal" href="#build-and-run-mxnet-on-a-gpu-instance">Build and Run MXNet on a GPU Instance</a></li>
<li><a class="reference internal" href="#set-up-an-ec2-gpu-cluster-for-distributed-training">Set Up an EC2 GPU Cluster for Distributed Training</a></li>
<li><a class="reference internal" href="#more-options">More Options</a><ul>
<li><a class="reference internal" href="#use-multiple-data-shards">Use Multiple Data Shards</a></li>
<li><a class="reference internal" href="#use-yarn-and-sge">Use YARN and SGE</a></li>
</ul>
</li>
</ul>
</li>
</ul>
</li>
</ul>
</div>
</div>
</div> <!-- pagename != index -->
<script crossorigin="anonymous" integrity="sha384-0mSbJDEHialfmuBBQP6A4Qrprq5OVfW37PRR3j5ELqxss1yVqOtnepnHVP9aJ7xS" src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js"></script>
<script src="../_static/js/sidebar.js" type="text/javascript"></script>
<script src="../_static/js/search.js" type="text/javascript"></script>
<script src="../_static/js/navbar.js" type="text/javascript"></script>
<script src="../_static/js/clipboard.min.js" type="text/javascript"></script>
<script src="../_static/js/copycode.js" type="text/javascript"></script>
<script type="text/javascript">
// After jQuery signals that the DOM is ready, set the body's CSS visibility
// to 'visible'.
// NOTE(review): presumably the stylesheet hides the body until scripts have
// loaded — confirm in mxnet.css.
$('body').ready(function () {
  var pageBody = $('body');
  pageBody.css('visibility', 'visible');
});
</script>
</div></body>
</html>