blob: e8febfe0402984b238d6832340d82a6673fde059 [file] [log] [blame]
<!DOCTYPE html>
<html lang="en"><head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link href="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/mxnet-icon.png" rel="icon" type="image/png"><!-- Begin Jekyll SEO tag v2.6.1 -->
<title>Use data from S3 for training | Apache MXNet</title>
<meta name="generator" content="Jekyll v3.8.6" />
<meta property="og:title" content="Use data from S3 for training" />
<meta property="og:locale" content="en_US" />
<meta name="description" content="A flexible and efficient library for deep learning." />
<meta property="og:description" content="A flexible and efficient library for deep learning." />
<link rel="canonical" href="https://mxnet.apache.org/versions/1.8.0/api/faq/s3_integration" />
<meta property="og:url" content="https://mxnet.apache.org/versions/1.8.0/api/faq/s3_integration" />
<meta property="og:site_name" content="Apache MXNet" />
<script type="application/ld+json">
{"url":"https://mxnet.apache.org/versions/1.8.0/api/faq/s3_integration","@type":"WebPage","description":"A flexible and efficient library for deep learning.","headline":"Use data from S3 for training","@context":"https://schema.org"}</script>
<!-- End Jekyll SEO tag -->
<script src="https://medium-widget.pixelpoint.io/widget.js"></script>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.css" />
<link rel="stylesheet" href="/versions/1.8.0/assets/main.css"><link type="application/atom+xml" rel="alternate" href="https://mxnet.apache.org/versions/1.8.0/feed.xml" title="Apache MXNet" /><script>
// Load Google Analytics only when the browser does not signal Do Not Track.
// The window.doNotTrack, navigator.doNotTrack ("1" or "yes") and
// navigator.msDoNotTrack flags are all checked; any one of them disables tracking.
if(!(window.doNotTrack === "1" || navigator.doNotTrack === "1" || navigator.doNotTrack === "yes" || navigator.msDoNotTrack === "1")) {
// Analytics bootstrap: records 'ga' as the GoogleAnalyticsObject name, defines
// window.ga as a stub that pushes its arguments onto ga.q (unless ga already
// exists), stores the current timestamp in ga.l, then creates an async <script>
// for analytics.js and inserts it before the first <script> element in the document.
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
// Issue the 'create' command for property UA-96378503-1, then send a pageview hit.
ga('create', 'UA-96378503-1', 'auto');
ga('send', 'pageview');
}
</script>
<script src="/versions/1.8.0/assets/js/jquery-3.3.1.min.js"></script><script src="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.js" defer></script>
<script src="/versions/1.8.0/assets/js/globalSearch.js" defer></script>
<script src="/versions/1.8.0/assets/js/clipboard.js" defer></script>
<script src="/versions/1.8.0/assets/js/copycode.js" defer></script></head>
<body><header class="site-header" role="banner">
<script>
// Runs once the DOM is ready: wires up the scroll-dependent header background
// and highlights the navigation link(s) matching the current page.
$(document).ready(function () {
  // HEADER OPACITY LOGIC
  // Sets the header background to rgba(4,140,204,alpha), where alpha is 0.4 at
  // the top of the page and increases by 1/300 for every pixel scrolled.
  function opacity_header() {
    var value = "rgba(4,140,204," + ($(window).scrollTop() / 300 + 0.4) + ")";
    $('.site-header').css("background-color", value);
  }

  $(window).scroll(function () {
    opacity_header();
  });
  // Apply once immediately so the header is styled before the first scroll event.
  opacity_header();

  // MENU SELECTOR LOGIC
  // Adds "page-current" to every .page-link whose resolved href is a substring
  // of the current URL. indexOf is used rather than String.prototype.includes
  // because Internet Explorer does not implement includes(), and this page
  // opts into IE's edge mode through its X-UA-Compatible meta tag.
  var currentUrl = window.location.href;
  $('.page-link').each(function () {
    if (currentUrl.indexOf(this.href) !== -1) {
      $(this).addClass("page-current");
    }
  });
});
</script>
<div class="wrapper">
<a class="site-title" rel="author" href="/versions/1.8.0/"><img
src="/versions/1.8.0/assets/img/mxnet_logo.png" class="site-header-logo" alt="Apache MXNet"></a>
<nav class="site-nav">
<input type="checkbox" id="nav-trigger" class="nav-trigger"/>
<label for="nav-trigger">
<span class="menu-icon">
<svg viewBox="0 0 18 15" width="18px" height="15px">
<path d="M18,1.484c0,0.82-0.665,1.484-1.484,1.484H1.484C0.665,2.969,0,2.304,0,1.484l0,0C0,0.665,0.665,0,1.484,0 h15.032C17.335,0,18,0.665,18,1.484L18,1.484z M18,7.516C18,8.335,17.335,9,16.516,9H1.484C0.665,9,0,8.335,0,7.516l0,0 c0-0.82,0.665-1.484,1.484-1.484h15.032C17.335,6.031,18,6.696,18,7.516L18,7.516z M18,13.516C18,14.335,17.335,15,16.516,15H1.484 C0.665,15,0,14.335,0,13.516l0,0c0-0.82,0.665-1.483,1.484-1.483h15.032C17.335,12.031,18,12.695,18,13.516L18,13.516z"/>
</svg>
</span>
</label>
<div class="gs-search-border">
<div id="gs-search-icon"></div>
<form id="global-search-form">
<input id="global-search" type="text" title="Search" placeholder="Search" />
<div id="global-search-dropdown-container">
<button class="gs-current-version btn" type="button" data-toggle="dropdown">
<span id="gs-current-version-label">1.8.0</span>
<svg class="gs-dropdown-caret" viewBox="0 0 32 32" aria-hidden="true">
<path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path>
</svg>
</button>
<ul class="gs-opt-group gs-version-dropdown">
<li class="gs-opt gs-versions">master</li>
<li class="gs-opt gs-versions active">1.8.0</li>
<li class="gs-opt gs-versions">1.7.0</li>
<li class="gs-opt gs-versions">1.6.0</li>
<li class="gs-opt gs-versions">1.5.0</li>
<li class="gs-opt gs-versions">1.4.1</li>
<li class="gs-opt gs-versions">1.3.1</li>
<li class="gs-opt gs-versions">1.2.1</li>
<li class="gs-opt gs-versions">1.1.0</li>
<li class="gs-opt gs-versions">1.0.0</li>
<li class="gs-opt gs-versions">0.12.1</li>
<li class="gs-opt gs-versions">0.11.0</li>
</ul>
</div>
<span id="global-search-close">x</span>
</form>
</div>
<div class="trigger">
<div id="global-search-mobile-border">
<div id="gs-search-icon-mobile"></div>
<input id="global-search-mobile" placeholder="Search..." type="text" aria-label="Search"/>
<div id="global-search-dropdown-container-mobile">
<button class="gs-current-version-mobile btn" type="button" data-toggle="dropdown">
<svg class="gs-dropdown-caret" viewBox="0 0 32 32" aria-hidden="true">
<path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path>
</svg>
</button>
<ul class="gs-opt-group gs-version-dropdown-mobile">
<li class="gs-opt gs-versions">master</li>
<li class="gs-opt gs-versions active">1.8.0</li>
<li class="gs-opt gs-versions">1.7.0</li>
<li class="gs-opt gs-versions">1.6.0</li>
<li class="gs-opt gs-versions">1.5.0</li>
<li class="gs-opt gs-versions">1.4.1</li>
<li class="gs-opt gs-versions">1.3.1</li>
<li class="gs-opt gs-versions">1.2.1</li>
<li class="gs-opt gs-versions">1.1.0</li>
<li class="gs-opt gs-versions">1.0.0</li>
<li class="gs-opt gs-versions">0.12.1</li>
<li class="gs-opt gs-versions">0.11.0</li>
</ul>
</div>
</div>
<a class="page-link" href="/versions/1.8.0/get_started">Get Started</a>
<a class="page-link" href="/versions/1.8.0/blog">Blog</a>
<a class="page-link" href="/versions/1.8.0/features">Features</a>
<a class="page-link" href="/versions/1.8.0/ecosystem">Ecosystem</a>
<a class="page-link" href="/versions/1.8.0/api">Docs &amp; Tutorials</a>
<a class="page-link" href="https://github.com/apache/incubator-mxnet">GitHub</a>
<div class="dropdown">
<span class="dropdown-header">1.8.0
<svg class="dropdown-caret" viewBox="0 0 32 32" aria-hidden="true"><path class="dropdown-caret-path" d="M24 11.305l-7.997 11.39L8 11.305z"></path></svg>
</span>
<div class="dropdown-content">
<a href="/">master</a>
<a class="dropdown-option-active" href="/versions/1.8.0/">1.8.0</a>
<a href="/versions/1.7.0/">1.7.0</a>
<a href="/versions/1.6.0/">1.6.0</a>
<a href="/versions/1.5.0/">1.5.0</a>
<a href="/versions/1.4.1/">1.4.1</a>
<a href="/versions/1.3.1/">1.3.1</a>
<a href="/versions/1.2.1/">1.2.1</a>
<a href="/versions/1.1.0/">1.1.0</a>
<a href="/versions/1.0.0/">1.0.0</a>
<a href="/versions/0.12.1/">0.12.1</a>
<a href="/versions/0.11.0/">0.11.0</a>
</div>
</div>
</div>
</nav>
</div>
</header>
<main class="page-content" aria-label="Content">
<script>
</script>
<article class="post">
<header class="post-header wrapper">
<h1 class="post-title">Use data from S3 for training</h1>
<h3></h3></header>
<div class="post-content">
<div class="wrapper">
<div class="row">
<div class="col-3 docs-side-bar">
<h3 style="text-transform: capitalize; padding-left:10px">faq</h3>
<ul>
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.8.0/api/faq/add_op_in_backend">A Beginner's Guide to Implementing Operators in MXNet Backend</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.8.0/api/faq/caffe">Convert from Caffe to MXNet</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.8.0/api/faq/cloud">MXNet on the Cloud</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.8.0/api/faq/distributed_training">Distributed Training in MXNet</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.8.0/api/faq/env_var">Environment Variables</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.8.0/api/faq/float16">Float16</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.8.0/api/faq/large_tensor_support">Using MXNet with Large Tensor Support</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.8.0/api/faq/model_parallel_lstm">Model Parallel</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.8.0/api/faq/multi_device">Data Parallelism with Multiple CPU/GPUs on MXNet</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.8.0/api/faq/new_op">Create New Operators</a></li>
<!-- page-category -->
<li><a href="/versions/1.8.0/api/faq/nnpack">NNPACK for Multi-Core CPU Support in MXNet</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.8.0/api/faq/perf">Some Tips for Improving MXNet Performance</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.8.0/api/faq/recordio">Create a Dataset Using RecordIO</a></li>
<!-- page-category -->
<li><a href="/versions/1.8.0/api/faq/s3_integration">Use data from S3 for training</a></li>
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.8.0/api/faq/security">MXNet Security Best Practices</a></li>
<!-- page-category -->
<li><a href="/versions/1.8.0/api/faq/smart_device">Deep Learning at the Edge</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<li><a href="/versions/1.8.0/api/faq/visualize_graph">Visualize Neural Networks</a></li>
<!-- page-category -->
<li><a href="/versions/1.8.0/api/faq/why_mxnet">Why MXNet came to be?</a></li>
<!-- page-category -->
<!-- page-category -->
<!-- page-category -->
<!-- resource-p -->
</ul>
</div>
<div class="col-9">
<!--- Licensed to the Apache Software Foundation (ASF) under one -->
<!--- or more contributor license agreements. See the NOTICE file -->
<!--- distributed with this work for additional information -->
<!--- regarding copyright ownership. The ASF licenses this file -->
<!--- to you under the Apache License, Version 2.0 (the -->
<!--- "License"); you may not use this file except in compliance -->
<!--- with the License. You may obtain a copy of the License at -->
<!--- http://www.apache.org/licenses/LICENSE-2.0 -->
<!--- Unless required by applicable law or agreed to in writing, -->
<!--- software distributed under the License is distributed on an -->
<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
<!--- KIND, either express or implied. See the License for the -->
<!--- specific language governing permissions and limitations -->
<!--- under the License. -->
<h1 id="use-data-from-s3-for-training">Use data from S3 for training</h1>
<p>AWS S3 is a cloud-based object storage service that allows storage and retrieval of large amounts of data at a very low cost. This makes it an attractive option to store large training datasets. MXNet is deeply integrated with S3 for this purpose.</p>
<p>An S3 protocol URL (like <code>s3://bucket-name/training-data</code>) can be provided as a parameter for any data iterator that takes a file path as input. For example,</p>
<div class="highlight"><pre><code class="language-" data-lang="">data_iter = mx.io.ImageRecordIter(
path_imgrec="s3://bucket-name/training-data/caltech_train.rec",
data_shape=(3, 227, 227),
batch_size=4,
resize=256)
</code></pre></div>
<p>Following are detailed instructions on how to use data from S3 for training.</p>
<h2 id="step-1-build-mxnet-with-s3-integration-enabled">Step 1: Build MXNet with S3 integration enabled</h2>
<p>Follow instructions <a href="/versions/1.8.0/get_started">here</a> to install MXNet from source with the following additional steps to enable S3 integration.</p>
<ol>
<li>Install <code>libcurl4-openssl-dev</code> and <code>libssl-dev</code> before building MXNet. These packages are required to read/write from AWS S3.</li>
<li>Append <code>USE_S3=1</code> to <code>config.mk</code> before building MXNet.
<code>
echo &quot;USE_S3=1&quot; &gt;&gt; config.mk
</code></li>
</ol>
<h2 id="step-2-configure-s3-authentication-tokens">Step 2: Configure S3 authentication tokens</h2>
<p>MXNet requires the S3 environment variables <code>AWS_ACCESS_KEY_ID</code> and <code>AWS_SECRET_ACCESS_KEY</code> to be set. <a href="https://aws.amazon.com/blogs/security/wheres-my-secret-access-key/">Here</a> are instructions to get the access keys from AWS console.</p>
<div class="highlight"><pre><code class="language-" data-lang="">export AWS_ACCESS_KEY_ID=&lt;your-access-key-id&gt;
export AWS_SECRET_ACCESS_KEY=&lt;your-secret-access-key&gt;
</code></pre></div>
<h2 id="step-3-upload-data-to-s3">Step 3: Upload data to S3</h2>
<p>There are several ways to upload data to S3. One easy way is to use the AWS command line utility. For example, the following <code>sync</code> command will recursively copy contents from a local directory to a directory in S3.</p>
<div class="highlight"><pre><code class="language-" data-lang="">aws s3 sync ./training-data s3://bucket-name/training-data
</code></pre></div>
<h2 id="step-4-train-with-data-from-s3">Step 4: Train with data from S3</h2>
<p>Once the data is in S3, it is very straightforward to use it from MXNet. Any data iterator that can read/write data from a local drive can also read/write data from S3.</p>
<p>Let&#39;s modify an existing example in the MXNet repository to read data from S3 instead of the local disk. <a href="https://github.com/dmlc/mxnet/blob/master/tests/python/train/test_conv.py"><code>mxnet/tests/python/train/test_conv.py</code></a> trains a convolutional network using MNIST data from the local disk. We&#39;ll make the following change to read the data from S3 instead.</p>
<div class="highlight"><pre><code class="language-" data-lang=""><span class="err">~/mxnet$</span> sed -i -- 's/data\//s3:\/\/bucket-name\/training-data\//g' ./tests/python/train/test_conv.py
~/mxnet$ git diff ./tests/python/train/test_conv.py
<span class="gh">diff --git a/tests/python/train/test_conv.py b/tests/python/train/test_conv.py
index 039790e..66a60ce 100644
</span><span class="gd">--- a/tests/python/train/test_conv.py
</span><span class="gi">+++ b/tests/python/train/test_conv.py
</span><span class="p">@@ -39,14 +39,14 @@</span> def get_iters():
batch_size = 100
train_dataiter = mx.io.MNISTIter(
<span class="gd">- image="data/train-images-idx3-ubyte",
- label="data/train-labels-idx1-ubyte",
</span><span class="gi">+ image="s3://bucket-name/training-data/train-images-idx3-ubyte",
+ label="s3://bucket-name/training-data/train-labels-idx1-ubyte",
</span> data_shape=(1, 28, 28),
label_name='sm_label',
batch_size=batch_size, shuffle=True, flat=False, silent=False, seed=10)
val_dataiter = mx.io.MNISTIter(
<span class="gd">- image="data/t10k-images-idx3-ubyte",
- label="data/t10k-labels-idx1-ubyte",
</span><span class="gi">+ image="s3://bucket-name/training-data/t10k-images-idx3-ubyte",
+ label="s3://bucket-name/training-data/t10k-labels-idx1-ubyte",
</span> data_shape=(1, 28, 28),
label_name='sm_label',
batch_size=batch_size, shuffle=True, flat=False, silent=False)
</code></pre></div>
<p>After the above change <code>test_conv.py</code> will fetch data from S3 instead of the local disk.</p>
<div class="highlight"><pre><code class="language-" data-lang="">python ./tests/python/train/test_conv.py
[21:59:19] src/io/s3_filesys.cc:878: No AWS Region set, using default region us-east-1
[21:59:21] src/io/iter_mnist.cc:94: MNISTIter: load 60000 images, shuffle=1, shape=(100,1,28,28)
[21:59:21] src/io/iter_mnist.cc:94: MNISTIter: load 10000 images, shuffle=1, shape=(100,1,28,28)
INFO:root:Start training with [cpu(0)]
Start training with [cpu(0)]
INFO:root:Epoch[0] Resetting Data Iterator
Epoch[0] Resetting Data Iterator
INFO:root:Epoch[0] Time cost=11.277
Epoch[0] Time cost=11.277
INFO:root:Epoch[0] Validation-accuracy=0.955100
Epoch[0] Validation-accuracy=0.955100
INFO:root:Finish fit...
Finish fit...
INFO:root:Finish predict...
Finish predict...
INFO:root:final accuracy = 0.955100
final accuracy = 0.955100
</code></pre></div>
</div>
</div>
</div>
</div>
</article>
</main><footer class="site-footer h-card">
<div class="wrapper">
<div class="row">
<div class="col-4">
<h4 class="footer-category-title">Resources</h4>
<ul class="contact-list">
<li><a href="/versions/1.8.0/community/contribute#mxnet-dev-communications">Mailing lists</a></li>
<li><a href="https://cwiki.apache.org/confluence/display/MXNET/Apache+MXNet+Home">Developer Wiki</a></li>
<li><a href="https://issues.apache.org/jira/projects/MXNET/issues">Jira Tracker</a></li>
<li><a href="https://github.com/apache/incubator-mxnet/labels/Roadmap">Github Roadmap</a></li>
<li><a href="https://discuss.mxnet.io">MXNet Discuss forum</a></li>
<li><a href="/versions/1.8.0/community/contribute">Contribute To MXNet</a></li>
</ul>
</div>
<div class="col-4"><ul class="social-media-list"><li><a href="https://github.com/apache/incubator-mxnet"><svg class="svg-icon"><use xlink:href="/versions/1.8.0/assets/minima-social-icons.svg#github"></use></svg> <span class="username">apache/incubator-mxnet</span></a></li><li><a href="https://www.twitter.com/apachemxnet"><svg class="svg-icon"><use xlink:href="/versions/1.8.0/assets/minima-social-icons.svg#twitter"></use></svg> <span class="username">apachemxnet</span></a></li><li><a href="https://youtube.com/apachemxnet"><svg class="svg-icon"><use xlink:href="/versions/1.8.0/assets/minima-social-icons.svg#youtube"></use></svg> <span class="username">apachemxnet</span></a></li></ul>
</div>
<div class="col-4 footer-text">
<p>A flexible and efficient library for deep learning.</p>
</div>
</div>
</div>
</footer>
<footer class="site-footer2">
<div class="wrapper">
<div class="row">
<div class="col-3">
<img src="/versions/1.8.0/assets/img/apache_incubator_logo.png" class="footer-logo col-2" alt="Apache Incubator">
</div>
<div class="footer-bottom-warning col-9">
<p>Apache MXNet is an effort undergoing incubation at The Apache Software Foundation (ASF), <span
style="font-weight:bold">sponsored by the <i>Apache Incubator</i></span>. Incubation is required
of all newly accepted projects until a further review indicates that the infrastructure,
communications, and decision making process have stabilized in a manner consistent with other
successful ASF projects. While incubation status is not necessarily a reflection of the completeness
or stability of the code, it does indicate that the project has yet to be fully endorsed by the ASF.
</p><p>"Copyright © 2017-2018, The Apache Software Foundation. Apache MXNet, MXNet, Apache, the Apache
feather, and the Apache MXNet project logo are either registered trademarks or trademarks of the
Apache Software Foundation."</p>
</div>
</div>
</div>
</footer>
</body>
</html>