| <!DOCTYPE html> |
| |
| <!--- |
| Licensed to the Apache Software Foundation (ASF) under one |
| or more contributor license agreements. See the NOTICE file |
| distributed with this work for additional information |
| regarding copyright ownership. The ASF licenses this file |
| to you under the Apache License, Version 2.0 (the |
| "License"); you may not use this file except in compliance |
| with the License. You may obtain a copy of the License at |
| http://www.apache.org/licenses/LICENSE-2.0 |
| Unless required by applicable law or agreed to in writing, |
| software distributed under the License is distributed on an |
| "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| KIND, either express or implied. See the License for the |
| specific language governing permissions and limitations |
| under the License. |
| --> |
| |
| <html lang=" en"><head> |
| <meta charset="utf-8"> |
| <meta http-equiv="X-UA-Compatible" content="IE=edge"> |
| <meta name="viewport" content="width=device-width, initial-scale=1"> |
| <link href="/versions/master/assets/img/mxnet-icon.png" rel="icon" type="image/png"><!-- Begin Jekyll SEO tag v2.6.1 --> |
| <title>MXNet on the Cloud | Apache MXNet</title> |
| <meta name="generator" content="Jekyll v4.0.0" /> |
| <meta property="og:title" content="MXNet on the Cloud" /> |
| <meta property="og:locale" content="en_US" /> |
| <meta name="description" content="A flexible and efficient library for deep learning." /> |
| <meta property="og:description" content="A flexible and efficient library for deep learning." /> |
| <link rel="canonical" href="https://mxnet.apache.org/versions/master/api/faq/cloud" /> |
| <meta property="og:url" content="https://mxnet.apache.org/versions/master/api/faq/cloud" /> |
| <meta property="og:site_name" content="Apache MXNet" /> |
| <script type="application/ld+json"> |
| {"url":"https://mxnet.apache.org/versions/master/api/faq/cloud","headline":"MXNet on the Cloud","description":"A flexible and efficient library for deep learning.","@type":"WebPage","@context":"https://schema.org"}</script> |
| <!-- End Jekyll SEO tag --> |
| <link rel="stylesheet" href="/versions/master/assets/docsearch.min.css" /><link rel="stylesheet" href="/versions/master/assets/main.css"><link type="application/atom+xml" rel="alternate" href="https://mxnet.apache.org/versions/master/feed.xml" title="Apache MXNet" /><!-- Matomo --> |
| <script> |
| var _paq = window._paq = window._paq || []; |
| /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ |
| /* We explicitly disable cookie tracking to avoid privacy issues */ |
| _paq.push(['disableCookies']); |
| _paq.push(['trackPageView']); |
| _paq.push(['enableLinkTracking']); |
| (function() { |
| var u="https://analytics.apache.org/"; |
| _paq.push(['setTrackerUrl', u+'matomo.php']); |
| _paq.push(['setSiteId', '23']); |
| var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; |
| g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); |
| })(); |
| </script> |
| <!-- End Matomo Code --> |
| |
| <script src="/versions/master/assets/js/jquery-3.3.1.min.js"></script> |
| <script src="/versions/master/assets/js/docsearch.min.js"></script><script src="/versions/master/assets/js/globalSearch.js" defer></script> |
| <script src="/versions/master/assets/js/clipboard.js" defer></script> |
| <script src="/versions/master/assets/js/copycode.js" defer></script></head> |
| <body><header class="site-header" role="banner"> |
| |
| <script> |
| $(document).ready(function () { |
| |
| // HEADER OPACITY LOGIC |
| |
| function opacity_header() { |
| var value = "rgba(4,140,204," + ($(window).scrollTop() / 300 + 0.4) + ")" |
| $('.site-header').css("background-color", value) |
| } |
| |
| $(window).scroll(function () { |
| opacity_header() |
| }) |
| opacity_header(); |
| |
| // MENU SELECTOR LOGIC |
| $('.page-link').each( function () { |
| if (window.location.href.includes(this.href)) { |
| $(this).addClass("page-current"); |
| } |
| }); |
| }) |
| </script> |
| <div class="wrapper"> |
| <a class="site-title" rel="author" href="/versions/master/"><img |
| src="/versions/master/assets/img/mxnet_logo.png" class="site-header-logo"></a> |
| <nav class="site-nav"> |
| <input type="checkbox" id="nav-trigger" class="nav-trigger"/> |
| <label for="nav-trigger"> |
| <span class="menu-icon"> |
| <svg viewBox="0 0 18 15" width="18px" height="15px"> |
| <path d="M18,1.484c0,0.82-0.665,1.484-1.484,1.484H1.484C0.665,2.969,0,2.304,0,1.484l0,0C0,0.665,0.665,0,1.484,0 h15.032C17.335,0,18,0.665,18,1.484L18,1.484z M18,7.516C18,8.335,17.335,9,16.516,9H1.484C0.665,9,0,8.335,0,7.516l0,0 c0-0.82,0.665-1.484,1.484-1.484h15.032C17.335,6.031,18,6.696,18,7.516L18,7.516z M18,13.516C18,14.335,17.335,15,16.516,15H1.484 C0.665,15,0,14.335,0,13.516l0,0c0-0.82,0.665-1.483,1.484-1.483h15.032C17.335,12.031,18,12.695,18,13.516L18,13.516z"/> |
| </svg> |
| </span> |
| </label> |
| <div class="gs-search-border"> |
| <div id="gs-search-icon"></div> |
| <form id="global-search-form"> |
| <input id="global-search" type="text" title="Search" placeholder="Search" /> |
| <div id="global-search-dropdown-container"> |
| <button class="gs-current-version btn" type="button" data-toggle="dropdown"> |
| <span id="gs-current-version-label">master</span> |
| <svg class="gs-dropdown-caret" viewBox="0 0 32 32" class="icon icon-caret-bottom" aria-hidden="true"> |
| <path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path> |
| </svg> |
| </button> |
| <ul class="gs-opt-group gs-version-dropdown"> |
| |
| |
| <li class="gs-opt gs-versions active">master</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.9.1</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.8.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.7.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.6.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.5.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.4.1</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.3.1</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.2.1</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.1.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.0.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">0.12.1</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">0.11.0</li> |
| |
| |
| </ul> |
| </div> |
| <span id="global-search-close">x</span> |
| </form> |
| </div> |
| <div class="trigger"> |
| <div id="global-search-mobile-border"> |
| <div id="gs-search-icon-mobile"></div> |
| <input id="global-search-mobile" placeholder="Search..." type="text"/> |
| <div id="global-search-dropdown-container-mobile"> |
| <button class="gs-current-version-mobile btn" type="button" data-toggle="dropdown"> |
| <svg class="gs-dropdown-caret" viewBox="0 0 32 32" class="icon icon-caret-bottom" aria-hidden="true"> |
| <path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path> |
| </svg> |
| </button> |
| <ul class="gs-opt-group gs-version-dropdown-mobile"> |
| |
| |
| <li class="gs-opt gs-versions active">master</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.9.1</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.8.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.7.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.6.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.5.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.4.1</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.3.1</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.2.1</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.1.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">1.0.0</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">0.12.1</li> |
| |
| |
| |
| <li class="gs-opt gs-versions">0.11.0</li> |
| |
| |
| </ul> |
| </div> |
| </div> |
| <a class="page-link" href="/versions/master/get_started">Get Started</a> |
| <a class="page-link" href="/versions/master/features">Features</a> |
| <a class="page-link" href="/versions/master/ecosystem">Ecosystem</a> |
| <a class="page-link" href="/versions/master/api">Docs & Tutorials</a> |
| <a class="page-link" href="/versions/master/trusted_by">Trusted By</a> |
| <a class="page-link" href="https://github.com/apache/mxnet">GitHub</a> |
| <div class="dropdown" style="min-width:100px"> |
| <span class="dropdown-header">Apache |
| <svg class="dropdown-caret" viewBox="0 0 32 32" class="icon icon-caret-bottom" aria-hidden="true"><path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path></svg> |
| </span> |
| <div class="dropdown-content" style="min-width:250px"> |
| <a href="https://www.apache.org/foundation/">Apache Software Foundation</a> |
| <a href="https://www.apache.org/licenses/">License</a> |
| <a href="/versions/master/api/faq/security.html">Security</a> |
| <a href="https://privacy.apache.org/policies/privacy-policy-public.html">Privacy</a> |
| <a href="https://www.apache.org/events/current-event">Events</a> |
| <a href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a> |
| <a href="https://www.apache.org/foundation/thanks.html">Thanks</a> |
| </div> |
| </div> |
| <div class="dropdown"> |
| <span class="dropdown-header">master |
| <svg class="dropdown-caret" viewBox="0 0 32 32" class="icon icon-caret-bottom" aria-hidden="true"><path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path></svg> |
| </span> |
| <div class="dropdown-content"> |
| |
| |
| <a class="dropdown-option-active" href="/">master</a> |
| |
| |
| |
| <a href="/versions/1.9.1/">1.9.1</a> |
| |
| |
| |
| <a href="/versions/1.8.0/">1.8.0</a> |
| |
| |
| |
| <a href="/versions/1.7.0/">1.7.0</a> |
| |
| |
| |
| <a href="/versions/1.6.0/">1.6.0</a> |
| |
| |
| |
| <a href="/versions/1.5.0/">1.5.0</a> |
| |
| |
| |
| <a href="/versions/1.4.1/">1.4.1</a> |
| |
| |
| |
| <a href="/versions/1.3.1/">1.3.1</a> |
| |
| |
| |
| <a href="/versions/1.2.1/">1.2.1</a> |
| |
| |
| |
| <a href="/versions/1.1.0/">1.1.0</a> |
| |
| |
| |
| <a href="/versions/1.0.0/">1.0.0</a> |
| |
| |
| |
| <a href="/versions/0.12.1/">0.12.1</a> |
| |
| |
| |
| <a href="/versions/0.11.0/">0.11.0</a> |
| |
| |
| </div> |
| </div> |
| </div> |
| </nav> |
| </div> |
| </header> |
| <main class="page-content" aria-label="Content"> |
| <script> |
| |
| </script> |
| <article class="post"> |
| |
| <header class="post-header wrapper"> |
| <h1 class="post-title">MXNet on the Cloud</h1> |
| <h3></h3></header> |
| |
| <div class="post-content"> |
| <div class="wrapper"> |
| <div class="row"> |
| <div class="col-3 docs-side-bar"> |
| <h3 style="text-transform: capitalize; padding-left:10px">faq</h3> |
| <ul> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/master/api/faq/add_op_in_backend">A Beginner's Guide to Implementing Operators in MXNet Backend</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/master/api/faq/cloud">MXNet on the Cloud</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/master/api/faq/distributed_training">Distributed Training in MXNet</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/master/api/faq/env_var">Environment Variables</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/master/api/faq/float16">Float16</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/master/api/faq/large_tensor_support">Using MXNet with Large Tensor Support</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/master/api/faq/model_parallel_lstm">Model Parallel</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/master/api/faq/new_op">Create New Operators</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/master/api/faq/perf">Some Tips for Improving MXNet Performance</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/master/api/faq/recordio">Create a Dataset Using RecordIO</a></li> |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/master/api/faq/s3_integration">Use data from S3 for training</a></li> |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/master/api/faq/security">MXNet Security Best Practices</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/master/api/faq/tensor_inspector_tutorial">Use TensorInspector to Help Debug Operators</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/master/api/faq/using_rtc">Using runtime compilation (RTC) to write CUDA kernels in MXNet</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| |
| |
| <li><a href="/versions/master/api/faq/why_mxnet">Why MXNet came to be?</a></li> |
| <!-- page-category --> |
| |
| <!-- page-category --> |
| <!-- resource-p --> |
| </ul> |
| </div> |
| <div class="col-9"> |
| <!--- Licensed to the Apache Software Foundation (ASF) under one --> |
| <!--- or more contributor license agreements. See the NOTICE file --> |
| <!--- distributed with this work for additional information --> |
| <!--- regarding copyright ownership. The ASF licenses this file --> |
| <!--- to you under the Apache License, Version 2.0 (the --> |
| <!--- "License"); you may not use this file except in compliance --> |
| <!--- with the License. You may obtain a copy of the License at --> |
| |
| <!--- http://www.apache.org/licenses/LICENSE-2.0 --> |
| |
| <!--- Unless required by applicable law or agreed to in writing, --> |
| <!--- software distributed under the License is distributed on an --> |
| <!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --> |
| <!--- KIND, either express or implied. See the License for the --> |
| <!--- specific language governing permissions and limitations --> |
| <!--- under the License. --> |
| |
| <h1 id="mxnet-on-the-cloud">MXNet on the Cloud</h1> |
| |
| <p>Deep learning can require extremely powerful hardware, often for unpredictable durations of time. |
| Moreover, <em>MXNet</em> can benefit from both multiple GPUs and multiple machines. |
| Accordingly, cloud computing, as offered by AWS and others, |
| is especially well suited to training deep learning models. |
| Using AWS, we can rapidly fire up multiple machines with multiple GPUs each at will |
| and maintain the resources for precisely the amount of time needed.</p> |
| |
| <h2 id="set-up-an-aws-gpu-cluster-from-scratch">Set Up an AWS GPU Cluster from Scratch</h2> |
| |
| <p>In this document, we provide a step-by-step guide that will teach you |
| how to set up an AWS cluster with <em>MXNet</em>. We show how to:</p> |
| |
| <ul> |
| <li><a href="#use-pre-installed-ec2-gpu-instance">Use Pre-installed EC2 GPU Instance</a></li> |
| <li><a href="#build-and-run-mxnet-on-a-gpu-instance">Build and run MXNet on a single computer</a></li> |
| <li><a href="#set-up-an-ec2-gpu-cluster-for-distributed-training">Set up an EC2 GPU cluster for distributed training</a></li> |
| </ul> |
| |
| <h3 id="use-pre-installed-ec2-gpu-instance">Use Pre-installed EC2 GPU Instance</h3> |
| <p>The <a href="https://aws.amazon.com/marketplace/search/results?x=0&y=0&searchTerms=Deep+Learning+AMI">Deep Learning AMIs</a> |
| are a series of images supported and maintained by Amazon Web Services for use |
| on Amazon Elastic Compute Cloud (Amazon EC2) and contain the latest MXNet release.</p> |
| |
| <p>Now you can launch <em>MXNet</em> directly on an EC2 GPU instance. |
| You can also use <a href="https://jupyter.org">Jupyter</a> notebook on EC2 machine. |
| Here is a <a href="https://github.com/dmlc/mxnet-notebooks">good tutorial</a> |
| on how to connect to a Jupyter notebook running on an EC2 instance.</p> |
| |
| <h3 id="set-up-an-ec2-gpu-instance-from-scratch">Set Up an EC2 GPU Instance from Scratch</h3> |
| |
| <p><a href="https://aws.amazon.com/marketplace/search/results?x=0&y=0&searchTerms=Deep+Learning+Base+AMI">Deep Learning Base AMIs</a> |
| provide a foundational image with NVIDIA CUDA, cuDNN, GPU drivers, oneDNN, |
| Docker and Nvidia-Docker, etc. for deploying your own custom deep |
| learning environment. You may follow the <a href="https://mxnet.apache.org/get_started/build_from_source">MXNet Build From Source |
| instructions</a> easily on |
| the Deep Learning Base AMIs.</p> |
| |
| <h3 id="set-up-an-ec2-gpu-cluster-for-distributed-training">Set Up an EC2 GPU Cluster for Distributed Training</h3> |
| |
| <p>A cluster consists of multiple computers. |
| You can use one computer with <em>MXNet</em> installed as the root computer for submitting jobs,and then launch several |
| slave computers to run the jobs. For example, launch multiple instances using an |
| AMI with dependencies installed. There are two options:</p> |
| |
| <ul> |
| <li> |
| <p>Make all slaves’ ports accessible (same for the root) by setting type: All TCP, |
| Source: Anywhere in Configure Security Group.</p> |
| </li> |
| <li> |
| <p>Use the same <code class="highlighter-rouge">pem</code> as the root computer to access all slave computers, and |
| then copy the <code class="highlighter-rouge">pem</code> file into the root computer’s <code class="highlighter-rouge">~/.ssh/id_rsa</code>. If you do this, all slave computers can be accessed with SSH from the root.</p> |
| </li> |
| </ul> |
| |
| <p>Now, run the CNN on multiple computers. Assume that we are on a working |
| directory of the root computer, such as <code class="highlighter-rouge">~/train</code>, and MXNet is built as <code class="highlighter-rouge">~/mxnet</code>.</p> |
| |
| <ol> |
| <li>Pack the <em>MXNet</em> Python library into this working directory for easy |
| synchronization:</li> |
| </ol> |
| |
| <div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="nb">cp</span> <span class="nt">-r</span> ~/mxnet/python/mxnet <span class="nb">.</span> |
| <span class="nb">cp</span> ~/mxnet/lib/libmxnet.so mxnet/ |
| </code></pre></div></div> |
| |
| <p>And then copy the training program:</p> |
| |
| <div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="nb">cp</span> ~/mxnet/example/image-classification/<span class="k">*</span>.py <span class="nb">.</span> |
| <span class="nb">cp</span> <span class="nt">-r</span> ~/mxnet/example/image-classification/common <span class="nb">.</span> |
| </code></pre></div></div> |
| |
| <ol> |
| <li>Prepare a host file with all slaves private IPs. For example, <code class="highlighter-rouge">cat hosts</code>:</li> |
| </ol> |
| |
| <div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code> 172.30.0.172 |
| 172.30.0.171 |
| </code></pre></div></div> |
| |
| <ol> |
| <li>Assuming that there are two computers, train the CNN using two workers:</li> |
| </ol> |
| |
| <div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code> ../../tools/launch.py <span class="nt">-n</span> 2 <span class="nt">-H</span> hosts <span class="nt">--sync-dir</span> /tmp/mxnet python train_mnist.py <span class="nt">--kv-store</span> dist_sync |
| </code></pre></div></div> |
| |
| <p><strong><em>Note:</em></strong> Sometimes the jobs linger at the slave computers even though you’ve pressed <code class="highlighter-rouge">Ctrl-c</code> |
| at the root node. To terminate them, use the following command:</p> |
| |
| <div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="nb">cat </span>hosts | xargs <span class="nt">-I</span><span class="o">{}</span> ssh <span class="nt">-o</span> <span class="nv">StrictHostKeyChecking</span><span class="o">=</span>no <span class="o">{}</span> <span class="s1">'uname -a; pgrep python | xargs kill -9'</span> |
| </code></pre></div></div> |
| |
| <p><strong><em>Note:</em></strong> The preceding example is very simple to train and therefore isn’t a good |
| benchmark for distributed training. Consider using other <a href="https://github.com/apache/mxnet/tree/master/example/image-classification">examples</a>.</p> |
| |
| <h3 id="more-options">More Options</h3> |
| <h4 id="use-multiple-data-shards">Use Multiple Data Shards</h4> |
| <p>It is common to pack a dataset into multiple files, especially when working in a distributed environment. |
| <em>MXNet</em> supports direct loading from multiple data shards. |
| Put all of the record files into a folder, and point the data path to the folder.</p> |
| |
| <h4 id="use-yarn-and-sge">Use YARN and SGE</h4> |
| <p>Although using SSH can be simple when you don’t have a cluster scheduling framework, |
| <em>MXNet</em> is designed to be portable to various platforms. |
| We provide scripts available in <a href="https://github.com/dmlc/dmlc-core/tree/master/tracker">tracker</a> |
| to allow running on other cluster frameworks, including Hadoop (YARN) and SGE. |
| We welcome contributions from the community of examples of running <em>MXNet</em> on your favorite distributed platform.</p> |
| |
| </div> |
| </div> |
| |
| </div> |
| </div> |
| |
| </article> |
| |
| </main><footer class="site-footer h-card"> |
| <div class="wrapper"> |
| <div class="row"> |
| <div class="col-4"> |
| <h4 class="footer-category-title">Resources</h4> |
| <ul class="contact-list"> |
| <li><a href="/versions/master/community#stay-connected">Mailing lists</a></li> |
| <li><a href="/versions/master/community#github-issues">Github Issues</a></li> |
| <li><a href="https://github.com/apache/mxnet/projects">Projects</a></li> |
| <li><a href="https://cwiki.apache.org/confluence/display/MXNET/Apache+MXNet+Home">Developer Wiki</a></li> |
| <li><a href="https://discuss.mxnet.io">Forum</a></li> |
| <li><a href="/versions/master/community">Contribute To MXNet</a></li> |
| </ul> |
| </div> |
| |
| <div class="col-4"><ul class="social-media-list"><li><a href="https://github.com/apache/mxnet"><svg class="svg-icon"><use xlink:href="/versions/master/assets/minima-social-icons.svg#github"></use></svg> <span class="username">apache/mxnet</span></a></li><li><a href="https://www.twitter.com/apachemxnet"><svg class="svg-icon"><use xlink:href="/versions/master/assets/minima-social-icons.svg#twitter"></use></svg> <span class="username">apachemxnet</span></a></li><li><a href="https://youtube.com/apachemxnet"><svg class="svg-icon"><use xlink:href="/versions/master/assets/minima-social-icons.svg#youtube"></use></svg> <span class="username">apachemxnet</span></a></li></ul> |
| </div> |
| |
| <div class="col-4 footer-text"> |
| <p>A flexible and efficient library for deep learning.</p> |
| </div> |
| </div> |
| </div> |
| </footer> |
| <footer class="site-footer2"> |
| <div class="wrapper"> |
| <div class="row"> |
| <div class="col-3"> |
| <img src="/versions/master/assets/img/asf_logo.svg" class="footer-logo col-2"> |
| </div> |
| <div class="footer-bottom-warning col-9"> |
| </p><p>"Copyright © 2017-2022, The Apache Software Foundation. Licensed under the Apache License, Version 2.0. Apache MXNet, MXNet, Apache, the Apache |
| feather, and the Apache MXNet project logo are either registered trademarks or trademarks of the |
| Apache Software Foundation."</p> |
| </div> |
| </div> |
| </div> |
| </footer> |
| |
| |
| |
| |
| </body> |
| |
| </html> |