| <!DOCTYPE html> |
| |
| <!--- |
| Licensed to the Apache Software Foundation (ASF) under one |
| or more contributor license agreements. See the NOTICE file |
| distributed with this work for additional information |
| regarding copyright ownership. The ASF licenses this file |
| to you under the Apache License, Version 2.0 (the |
| "License"); you may not use this file except in compliance |
| with the License. You may obtain a copy of the License at |
| http://www.apache.org/licenses/LICENSE-2.0 |
| Unless required by applicable law or agreed to in writing, |
| software distributed under the License is distributed on an |
| "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| KIND, either express or implied. See the License for the |
| specific language governing permissions and limitations |
| under the License. |
| --> |
| |
| <html lang="en"><head> |
| <meta charset="utf-8"> |
| <meta http-equiv="X-UA-Compatible" content="IE=edge"> |
| <meta name="viewport" content="width=device-width, initial-scale=1"> |
| <link href="/versions/1.9.1/assets/img/mxnet-icon.png" rel="icon" type="image/png"><!-- Begin Jekyll SEO tag v2.6.1 --> |
| <title>MXNet on the Cloud | Apache MXNet</title> |
| <meta name="generator" content="Jekyll v3.8.6" /> |
| <meta property="og:title" content="MXNet on the Cloud" /> |
| <meta property="og:locale" content="en_US" /> |
| <meta name="description" content="A flexible and efficient library for deep learning." /> |
| <meta property="og:description" content="A flexible and efficient library for deep learning." /> |
| <link rel="canonical" href="https://mxnet.apache.org/versions/1.9.1/api/faq/cloud" /> |
| <meta property="og:url" content="https://mxnet.apache.org/versions/1.9.1/api/faq/cloud" /> |
| <meta property="og:site_name" content="Apache MXNet" /> |
| <script type="application/ld+json"> |
| {"description":"A flexible and efficient library for deep learning.","headline":"MXNet on the Cloud","@type":"WebPage","url":"https://mxnet.apache.org/versions/1.9.1/api/faq/cloud","@context":"https://schema.org"}</script> |
| <!-- End Jekyll SEO tag --> |
| <link rel="stylesheet" href="/versions/1.9.1/assets/docsearch.min.css" /><link rel="stylesheet" href="/versions/1.9.1/assets/main.css"><link type="application/atom+xml" rel="alternate" href="https://mxnet.apache.org/versions/1.9.1/feed.xml" title="Apache MXNet" /><!-- Matomo --> |
| <script> |
| var _paq = window._paq = window._paq || []; |
| /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ |
| /* We explicitly disable cookie tracking to avoid privacy issues */ |
| _paq.push(['disableCookies']); |
| _paq.push(['trackPageView']); |
| _paq.push(['enableLinkTracking']); |
| (function() { |
| var u="https://analytics.apache.org/"; |
| _paq.push(['setTrackerUrl', u+'matomo.php']); |
| _paq.push(['setSiteId', '23']); |
| var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; |
| g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); |
| })(); |
| </script> |
| <!-- End Matomo Code --> |
| |
| <script src="/versions/1.9.1/assets/js/jquery-3.3.1.min.js"></script> |
| <script src="/versions/1.9.1/assets/js/docsearch.min.js"></script><script src="/versions/1.9.1/assets/js/globalSearch.js" defer></script> |
| <script src="/versions/1.9.1/assets/js/clipboard.js" defer></script> |
| <script src="/versions/1.9.1/assets/js/copycode.js" defer></script></head> |
| <body><header class="site-header" role="banner"> |
| |
| <script> |
| $(document).ready(function () { |
| |
| // HEADER OPACITY LOGIC |
| |
| function opacity_header() { |
| var value = "rgba(4,140,204," + ($(window).scrollTop() / 300 + 0.4) + ")" |
| $('.site-header').css("background-color", value) |
| } |
| |
| $(window).scroll(function () { |
| opacity_header() |
| }) |
| opacity_header(); |
| |
| // MENU SELECTOR LOGIC |
| $('.page-link').each( function () { |
| if (window.location.href.includes(this.href)) { |
| $(this).addClass("page-current"); |
| } |
| }); |
| }) |
| </script> |
| <div class="wrapper"> |
| <a class="site-title" rel="author" href="/versions/1.9.1/"><img |
| src="/versions/1.9.1/assets/img/mxnet_logo.png" class="site-header-logo" alt="Apache MXNet"></a> |
| <nav class="site-nav"> |
| <input type="checkbox" id="nav-trigger" class="nav-trigger"/> |
| <label for="nav-trigger"> |
| <span class="menu-icon"> |
| <svg viewBox="0 0 18 15" width="18px" height="15px"> |
| <path d="M18,1.484c0,0.82-0.665,1.484-1.484,1.484H1.484C0.665,2.969,0,2.304,0,1.484l0,0C0,0.665,0.665,0,1.484,0 h15.032C17.335,0,18,0.665,18,1.484L18,1.484z M18,7.516C18,8.335,17.335,9,16.516,9H1.484C0.665,9,0,8.335,0,7.516l0,0 c0-0.82,0.665-1.484,1.484-1.484h15.032C17.335,6.031,18,6.696,18,7.516L18,7.516z M18,13.516C18,14.335,17.335,15,16.516,15H1.484 C0.665,15,0,14.335,0,13.516l0,0c0-0.82,0.665-1.483,1.484-1.483h15.032C17.335,12.031,18,12.695,18,13.516L18,13.516z"/> |
| </svg> |
| </span> |
| </label> |
| <div class="gs-search-border"> |
| <div id="gs-search-icon"></div> |
| <form id="global-search-form"> |
| <input id="global-search" type="text" title="Search" placeholder="Search" /> |
| <div id="global-search-dropdown-container"> |
| <button class="gs-current-version btn" type="button" data-toggle="dropdown"> |
| <span id="gs-current-version-label">1.9.1</span> |
| <svg class="gs-dropdown-caret" viewBox="0 0 32 32" aria-hidden="true"> |
| <path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path> |
| </svg> |
| </button> |
| <ul class="gs-opt-group gs-version-dropdown"> |
| |
| |
| <li class="gs-opt gs-versions">master</li> |
| |
| |
| |
| <li class="gs-opt gs-versions active">1.9.1</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.8.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.7.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.6.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.5.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.4.1</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.3.1</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.2.1</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.1.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.0.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">0.12.1</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">0.11.0</li> |
| |
| |
| </ul> |
| </div> |
| <span id="global-search-close">x</span> |
| </form> |
| </div> |
| <div class="trigger"> |
| <div id="global-search-mobile-border"> |
| <div id="gs-search-icon-mobile"></div> |
| <input id="global-search-mobile" placeholder="Search..." type="text" aria-label="Search"/> |
| <div id="global-search-dropdown-container-mobile"> |
| <button class="gs-current-version-mobile btn" type="button" data-toggle="dropdown"> |
| <svg class="gs-dropdown-caret" viewBox="0 0 32 32" aria-hidden="true"> |
| <path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path> |
| </svg> |
| </button> |
| <ul class="gs-opt-group gs-version-dropdown-mobile"> |
| |
| |
| <li class="gs-opt gs-versions">master</li> |
| |
| |
| |
| <li class="gs-opt gs-versions active">1.9.1</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.8.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.7.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.6.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.5.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.4.1</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.3.1</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.2.1</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.1.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.0.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">0.12.1</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">0.11.0</li> |
| |
| |
| </ul> |
| </div> |
| </div> |
| <a class="page-link" href="/versions/1.9.1/get_started">Get Started</a> |
| <a class="page-link" href="/versions/1.9.1/features">Features</a> |
| <a class="page-link" href="/versions/1.9.1/ecosystem">Ecosystem</a> |
| <a class="page-link" href="/versions/1.9.1/api">Docs &amp; Tutorials</a> |
| <a class="page-link" href="/versions/1.9.1/trusted_by">Trusted By</a> |
| <a class="page-link" href="https://github.com/apache/mxnet">GitHub</a> |
| <div class="dropdown" style="min-width:100px"> |
| <span class="dropdown-header">Apache |
| <svg class="dropdown-caret" viewBox="0 0 32 32" aria-hidden="true"><path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path></svg> |
| </span> |
| <div class="dropdown-content" style="min-width:250px"> |
| <a href="https://www.apache.org/foundation/">Apache Software Foundation</a> |
| <a href="https://www.apache.org/licenses/">License</a> |
| <a href="/versions/1.9.1/api/faq/security.html">Security</a> |
| <a href="https://privacy.apache.org/policies/privacy-policy-public.html">Privacy</a> |
| <a href="https://www.apache.org/events/current-event">Events</a> |
| <a href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a> |
| <a href="https://www.apache.org/foundation/thanks.html">Thanks</a> |
| </div> |
| </div> |
| <div class="dropdown"> |
| <span class="dropdown-header">1.9.1 |
| <svg class="dropdown-caret" viewBox="0 0 32 32" aria-hidden="true"><path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path></svg> |
| </span> |
| <div class="dropdown-content"> |
| <a href="/">master</a> |
| <a class="dropdown-option-active" href="/versions/1.9.1/">1.9.1</a> |
| <a href="/versions/1.8.0/">1.8.0</a> |
| <a href="/versions/1.7.0/">1.7.0</a> |
| <a href="/versions/1.6.0/">1.6.0</a> |
| <a href="/versions/1.5.0/">1.5.0</a> |
| <a href="/versions/1.4.1/">1.4.1</a> |
| <a href="/versions/1.3.1/">1.3.1</a> |
| <a href="/versions/1.2.1/">1.2.1</a> |
| <a href="/versions/1.1.0/">1.1.0</a> |
| <a href="/versions/1.0.0/">1.0.0</a> |
| <a href="/versions/0.12.1/">0.12.1</a> |
| <a href="/versions/0.11.0/">0.11.0</a> |
| </div> |
| </div> |
| </div> |
| </nav> |
| </div> |
| </header> |
| <main class="page-content" aria-label="Content"> |
| <script> |
| |
| </script> |
| <article class="post"> |
| |
| <header class="post-header wrapper"> |
| <h1 class="post-title">MXNet on the Cloud</h1> |
| <h3></h3></header> |
| |
| <div class="post-content"> |
| <div class="wrapper"> |
| <div class="row"> |
| <div class="col-3 docs-side-bar"> |
| <h3 style="text-transform: capitalize; padding-left:10px">faq</h3> |
| <ul> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/1.9.1/api/faq/add_op_in_backend">A Beginner's Guide to Implementing Operators in MXNet Backend</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/1.9.1/api/faq/caffe">Convert from Caffe to MXNet</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/1.9.1/api/faq/cloud">MXNet on the Cloud</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/1.9.1/api/faq/distributed_training">Distributed Training in MXNet</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/1.9.1/api/faq/env_var">Environment Variables</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/1.9.1/api/faq/float16">Float16</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/1.9.1/api/faq/large_tensor_support">Using MXNet with Large Tensor Support</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/1.9.1/api/faq/model_parallel_lstm">Model Parallel</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/1.9.1/api/faq/multi_device">Data Parallelism with Multiple CPU/GPUs on MXNet</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/1.9.1/api/faq/new_op">Create New Operators</a></li> |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/1.9.1/api/faq/nnpack">NNPACK for Multi-Core CPU Support in MXNet</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/1.9.1/api/faq/perf">Some Tips for Improving MXNet Performance</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/1.9.1/api/faq/recordio">Create a Dataset Using RecordIO</a></li> |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/1.9.1/api/faq/s3_integration">Use data from S3 for training</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/1.9.1/api/faq/security">MXNet Security Best Practices</a></li> |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/1.9.1/api/faq/smart_device">Deep Learning at the Edge</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/1.9.1/api/faq/visualize_graph">Visualize Neural Networks</a></li> |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/1.9.1/api/faq/why_mxnet">Why MXNet came to be?</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| <!-- resource-p --> |
| </ul> |
| </div> |
| <div class="col-9"> |
| <!--- Licensed to the Apache Software Foundation (ASF) under one --> |
| |
| <!--- or more contributor license agreements. See the NOTICE file --> |
| |
| <!--- distributed with this work for additional information --> |
| |
| <!--- regarding copyright ownership. The ASF licenses this file --> |
| |
| <!--- to you under the Apache License, Version 2.0 (the --> |
| |
| <!--- "License"); you may not use this file except in compliance --> |
| |
| <!--- with the License. You may obtain a copy of the License at --> |
| |
| <!--- http://www.apache.org/licenses/LICENSE-2.0 --> |
| |
| <!--- Unless required by applicable law or agreed to in writing, --> |
| |
| <!--- software distributed under the License is distributed on an --> |
| |
| <!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --> |
| |
| <!--- KIND, either express or implied. See the License for the --> |
| |
| <!--- specific language governing permissions and limitations --> |
| |
| <!--- under the License. --> |
| |
| <h1 id="mxnet-on-the-cloud">MXNet on the Cloud</h1> |
| |
| <p>Deep learning can require extremely powerful hardware, often for unpredictable durations of time. |
| Moreover, <em>MXNet</em> can benefit from both multiple GPUs and multiple machines. |
| Accordingly, cloud computing, as offered by AWS and others, |
| is especially well suited to training deep learning models. |
| Using AWS, we can rapidly fire up multiple machines with multiple GPUs each at will |
| and maintain the resources for precisely the amount of time needed.</p> |
| |
| <h2 id="set-up-an-aws-gpu-cluster-from-scratch">Set Up an AWS GPU Cluster from Scratch</h2> |
| |
| <p>In this document, we provide a step-by-step guide that will teach you |
| how to set up an AWS cluster with <em>MXNet</em>. We show how to:</p> |
| |
| <ul> |
| <li><a href="#use-amazon-s3-to-host-data">Use Amazon S3 to host data</a></li> |
| <li><a href="#set-up-an-ec2-gpu-instance-from-scratch">Set up an EC2 GPU instance with all dependencies installed</a></li> |
| <li><a href="#build-and-run-mxnet-on-a-gpu-instance">Build and run MXNet on a single computer</a></li> |
| <li><a href="#set-up-an-ec2-gpu-cluster-for-distributed-training">Set up an EC2 GPU cluster for distributed training</a></li> |
| </ul> |
| |
| <h3 id="use-amazon-s3-to-host-data">Use Amazon S3 to Host Data</h3> |
| |
| <p>Amazon S3 provides distributed data storage which proves especially convenient for hosting large datasets. |
| To use S3, you need <a href="https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSGettingStartedGuide/AWSCredentials.html">AWS credentials</a>, |
| including an <code>ACCESS_KEY_ID</code> and a <code>SECRET_ACCESS_KEY</code>.</p> |
| |
| <p>To use <em>MXNet</em> with S3, set the environment variables <code>AWS_ACCESS_KEY_ID</code> and |
| <code>AWS_SECRET_ACCESS_KEY</code> by adding the following two lines in |
| <code>~/.bashrc</code> (replacing the strings with the correct ones):</p> |
| <div class="highlight"><pre><code class="language-bash" data-lang="bash"><span class="nb">export </span><span class="nv">AWS_ACCESS_KEY_ID</span><span class="o">=</span>AKIAIOSFODNN7EXAMPLE |
| <span class="nb">export </span><span class="nv">AWS_SECRET_ACCESS_KEY</span><span class="o">=</span>wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY |
| </code></pre></div> |
| <p>There are several ways to upload data to S3. One simple way is to use |
| <a href="https://s3tools.org/s3cmd">s3cmd</a>. For example:</p> |
| <div class="highlight"><pre><code class="language-bash" data-lang="bash">wget http://data.mxnet.io/mxnet/data/mnist.zip |
| unzip mnist.zip <span class="o">&&</span> s3cmd put t<span class="k">*</span><span class="nt">-ubyte</span> s3://dmlc/mnist/ |
| </code></pre></div> |
| <h3 id="use-pre-installed-ec2-gpu-instance">Use Pre-installed EC2 GPU Instance</h3> |
| |
| <p>The <a href="https://aws.amazon.com/marketplace/pp/B01M0AXXQB?qid=1475211685369&amp;sr=0-1&amp;ref_=srh_res_product_title">Deep Learning AMI</a> is an Amazon Linux image |
| supported and maintained by Amazon Web Services for use on Amazon Elastic Compute Cloud (Amazon EC2). |
| It contains <a href="https://github.com/apache/mxnet">MXNet-v0.9.3 tag</a> and the necessary components to get going with deep learning, |
| including Nvidia drivers, CUDA, cuDNN, Anaconda, Python2 and Python3.<br> |
| The AMI IDs are the following:</p> |
| |
| <ul> |
| <li>us-east-1: ami-e7c96af1</li> |
| <li>us-west-2: ami-dfb13ebf</li> |
| <li>eu-west-1: ami-6e5d6808</li> |
| </ul> |
| |
| <p>Now you can launch <em>MXNet</em> directly on an EC2 GPU instance.<br> |
| You can also use a <a href="https://jupyter.org">Jupyter</a> notebook on an EC2 machine. |
| Here is a <a href="https://github.com/dmlc/mxnet-notebooks">good tutorial</a> |
| on how to connect to a Jupyter notebook running on an EC2 instance.</p> |
| |
| <h3 id="set-up-an-ec2-gpu-instance-from-scratch">Set Up an EC2 GPU Instance from Scratch</h3> |
| |
| <p><em>MXNet</em> requires the following libraries:</p> |
| |
| <ul> |
| <li>C++ compiler with C++11 support, such as <code>gcc >= 4.8</code></li> |
| <li><code>CUDA</code> (<code>CUDNN</code> is optional) for GPU linear algebra</li> |
| <li><code>BLAS</code> (cblas, open-blas, atlas, mkl, or others) for CPU linear algebra</li> |
| <li><code>opencv</code> for image augmentations</li> |
| <li><code>curl</code> and <code>openssl</code> for the ability to read/write to Amazon S3</li> |
| </ul> |
| |
| <p>Installing <code>CUDA</code> on EC2 instances requires some effort. Caffe has a good |
| <a href="https://github.com/BVLC/caffe/wiki/Install-Caffe-on-EC2-from-scratch-(Ubuntu,-CUDA-7,-cuDNN-3)">tutorial</a> |
| on how to install CUDA 7.0 on Ubuntu 14.04.</p> |
| |
| <p><strong><em>Note:</em></strong> We tried CUDA 7.5 on Nov 7, 2015, but found it problematic.</p> |
| |
| <p>You can install the rest using the package manager. For example, on Ubuntu:</p> |
| <div class="highlight"><pre><code class="language-" data-lang="">sudo apt-get update |
| sudo apt-get install -y build-essential git libcurl4-openssl-dev libatlas-base-dev libopencv-dev python-numpy |
| </code></pre></div> |
| <p>The Amazon Machine Image (AMI) <a href="https://console.aws.amazon.com/ec2/v2/home?region=us-east-1#LaunchInstanceWizard:ami=ami-12fd8178">ami-12fd8178</a> has the packages listed above installed.</p> |
| |
| <h3 id="build-and-run-mxnet-on-a-gpu-instance">Build and Run MXNet on a GPU Instance</h3> |
| |
| <p>The following commands build <em>MXNet</em> with CUDA/CUDNN, Amazon S3, and distributed |
| training.</p> |
| <div class="highlight"><pre><code class="language-bash" data-lang="bash">git clone <span class="nt">--recursive</span> https://github.com/apache/mxnet |
| <span class="nb">cd </span>mxnet<span class="p">;</span> <span class="nb">cp </span>make/config.mk <span class="nb">.</span> |
| <span class="nb">echo</span> <span class="s2">"USE_CUDA=1"</span> <span class="o">>></span>config.mk |
| <span class="nb">echo</span> <span class="s2">"USE_CUDA_PATH=/usr/local/cuda"</span> <span class="o">>></span>config.mk |
| <span class="nb">echo</span> <span class="s2">"USE_CUDNN=1"</span> <span class="o">>></span>config.mk |
| <span class="nb">echo</span> <span class="s2">"USE_BLAS=atlas"</span> <span class="o">>></span> config.mk |
| <span class="nb">echo</span> <span class="s2">"USE_DIST_KVSTORE = 1"</span> <span class="o">>></span>config.mk |
| <span class="nb">echo</span> <span class="s2">"USE_S3=1"</span> <span class="o">>></span>config.mk |
| make <span class="nt">-j</span><span class="si">$(</span><span class="nb">nproc</span><span class="si">)</span> |
| </code></pre></div> |
| <p>To test whether everything is installed properly, we can try training a convolutional neural network (CNN) on the MNIST dataset using a GPU:</p> |
| <div class="highlight"><pre><code class="language-bash" data-lang="bash">python example/image-classification/train_mnist.py |
| </code></pre></div> |
| <p>If you've placed the MNIST data on <code>s3://dmlc/mnist</code>, you can read the data stored on Amazon S3 directly with the following command:</p> |
| <div class="highlight"><pre><code class="language-bash" data-lang="bash"><span class="nb">sed</span> <span class="nt">-i</span>.bak <span class="s2">"s!data_dir = 'data'!data_dir = 's3://dmlc/mnist'!"</span> example/image-classification/train_mnist.py |
| </code></pre></div> |
| <p><strong><em>Note:</em></strong> You can use <code>sudo ln /dev/null /dev/raw1394</code> to fix the opencv error <code>libdc1394 error: Failed to initialize libdc1394</code>.</p> |
| |
| <h3 id="set-up-an-ec2-gpu-cluster-for-distributed-training">Set Up an EC2 GPU Cluster for Distributed Training</h3> |
| |
| <p>A cluster consists of multiple computers. |
| You can use one computer with <em>MXNet</em> installed as the root computer for submitting jobs, and then launch several |
| slave computers to run the jobs. For example, launch multiple instances using an |
| AMI, e.g., |
| <a href="https://console.aws.amazon.com/ec2/v2/home?region=us-east-1#LaunchInstanceWizard:ami=ami-12fd8178">ami-12fd8178</a>, |
| with dependencies installed. There are two options:</p> |
| |
| <ul> |
| <li><p>Make all slaves' ports accessible (same for the root) by setting type: All TCP, |
| Source: Anywhere in Configure Security Group.</p></li> |
| <li><p>Use the same <code>pem</code> as the root computer to access all slave computers, and |
| then copy the <code>pem</code> file into the root computer's <code>~/.ssh/id_rsa</code>. If you do this, all slave computers can be accessed with SSH from the root.</p></li> |
| </ul> |
| |
| <p>Now, run the CNN on multiple computers. Assume that we are on a working |
| directory of the root computer, such as <code>~/train</code>, and MXNet is built as <code>~/mxnet</code>.</p> |
| |
| <ol> |
| <li>Pack the <em>MXNet</em> Python library into this working directory for easy |
| synchronization:</li> |
| </ol> |
| <div class="highlight"><pre><code class="language-bash" data-lang="bash"> <span class="nb">cp</span> <span class="nt">-r</span> ~/mxnet/python/mxnet <span class="nb">.</span> |
| <span class="nb">cp</span> ~/mxnet/lib/libmxnet.so mxnet/ |
| </code></pre></div> |
| <p>And then copy the training program:</p> |
| <div class="highlight"><pre><code class="language-bash" data-lang="bash"> <span class="nb">cp</span> ~/mxnet/example/image-classification/<span class="k">*</span>.py <span class="nb">.</span> |
| <span class="nb">cp</span> <span class="nt">-r</span> ~/mxnet/example/image-classification/common <span class="nb">.</span> |
| </code></pre></div> |
| <ol start="2"> |
| <li>Prepare a host file with all slaves' private IPs. For example, <code>cat hosts</code>:</li> |
| </ol> |
| <div class="highlight"><pre><code class="language-bash" data-lang="bash"> 172.30.0.172 |
| 172.30.0.171 |
| </code></pre></div> |
| <ol start="3"> |
| <li>Assuming that there are two computers, train the CNN using two workers:</li> |
| </ol> |
| <div class="highlight"><pre><code class="language-bash" data-lang="bash"> ../../tools/launch.py <span class="nt">-n</span> 2 <span class="nt">-H</span> hosts <span class="nt">--sync-dir</span> /tmp/mxnet python train_mnist.py <span class="nt">--kv-store</span> dist_sync |
| </code></pre></div> |
| <p><strong><em>Note:</em></strong> Sometimes the jobs linger at the slave computers even though you've pressed <code>Ctrl-c</code> |
| at the root node. To terminate them, use the following command:</p> |
| <div class="highlight"><pre><code class="language-bash" data-lang="bash"><span class="nb">cat </span>hosts | xargs <span class="nt">-I</span><span class="o">{}</span> ssh <span class="nt">-o</span> <span class="nv">StrictHostKeyChecking</span><span class="o">=</span>no <span class="o">{}</span> <span class="s1">'uname -a; pgrep python | xargs kill -9'</span> |
| </code></pre></div> |
| <p><strong><em>Note:</em></strong> The preceding example is very simple to train and therefore isn't a good |
| benchmark for distributed training. Consider using other <a href="https://github.com/apache/mxnet/tree/v1.x/example/image-classification">examples</a>.</p> |
| |
| <h3 id="more-options">More Options</h3> |
| |
| <h4 id="use-multiple-data-shards">Use Multiple Data Shards</h4> |
| |
| <p>It is common to pack a dataset into multiple files, especially when working in a distributed environment. |
| <em>MXNet</em> supports direct loading from multiple data shards. |
| Put all of the record files into a folder, and point the data path to the folder.</p> |
| |
| <h4 id="use-yarn-and-sge">Use YARN and SGE</h4> |
| |
| <p>Although using SSH can be simple when you don't have a cluster scheduling framework, |
| <em>MXNet</em> is designed to be portable to various platforms.<br> |
| We provide scripts available in <a href="https://github.com/dmlc/dmlc-core/tree/master/tracker">tracker</a> |
| to allow running on other cluster frameworks, including Hadoop (YARN) and SGE. |
| We welcome community contributions of examples that run <em>MXNet</em> on your favorite distributed platform.</p> |
| |
| </div> |
| </div> |
| |
| </div> |
| </div> |
| |
| </article> |
| |
| </main><footer class="site-footer h-card"> |
| <div class="wrapper"> |
| <div class="row"> |
| <div class="col-4"> |
| <h4 class="footer-category-title">Resources</h4> |
| <ul class="contact-list"> |
| <li><a href="/versions/1.9.1/community/contribute#mxnet-dev-communications">Mailing lists</a></li> |
| <li><a href="https://cwiki.apache.org/confluence/display/MXNET/Apache+MXNet+Home">Developer Wiki</a></li> |
| <li><a href="https://issues.apache.org/jira/projects/MXNET/issues">Jira Tracker</a></li> |
| <li><a href="https://github.com/apache/mxnet/labels/Roadmap">Github Roadmap</a></li> |
| <li><a href="https://medium.com/apache-mxnet">Blog</a></li> |
| <li><a href="https://discuss.mxnet.io">Forum</a></li> |
| <li><a href="/versions/1.9.1/community/contribute">Contribute</a></li> |
| </ul> |
| </div> |
| |
| <div class="col-4"><ul class="social-media-list"><li><a href="https://github.com/apache/mxnet"><svg class="svg-icon"><use xlink:href="/versions/1.9.1/assets/minima-social-icons.svg#github"></use></svg> <span class="username">apache/mxnet</span></a></li><li><a href="https://www.twitter.com/apachemxnet"><svg class="svg-icon"><use xlink:href="/versions/1.9.1/assets/minima-social-icons.svg#twitter"></use></svg> <span class="username">apachemxnet</span></a></li><li><a href="https://youtube.com/apachemxnet"><svg class="svg-icon"><use xlink:href="/versions/1.9.1/assets/minima-social-icons.svg#youtube"></use></svg> <span class="username">apachemxnet</span></a></li></ul> |
| </div> |
| |
| <div class="col-4 footer-text"> |
| <p>A flexible and efficient library for deep learning.</p> |
| </div> |
| </div> |
| </div> |
| </footer> |
| <footer class="site-footer2"> |
| <div class="wrapper"> |
| <div class="row"> |
| <div class="col-3"> |
| <img src="/versions/1.9.1/assets/img/asf_logo.svg" class="footer-logo col-2" alt="Apache Software Foundation"> |
| </div> |
| <div class="footer-bottom-warning col-9"> |
| <p>"Copyright © 2017-2022, The Apache Software Foundation. Licensed under the Apache License, Version 2.0. Apache MXNet, MXNet, Apache, the Apache |
| feather, and the Apache MXNet project logo are either registered trademarks or trademarks of the |
| Apache Software Foundation."</p> |
| </div> |
| </div> |
| </div> |
| </footer> |
| |
| |
| |
| |
| </body> |
| |
| </html> |