versions/master/architecture/note_data_loading.html - mxnet-test - Git at Google

 <!DOCTYPE html>

 <html lang="en">
 <head>
 <meta charset="utf-8"/>
 <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
 <meta content="width=device-width, initial-scale=1" name="viewport"/>
 <title>Designing Efficient Data Loaders for Deep Learning — mxnet  documentation</title>
 <link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css" integrity="sha384-1q8mTJOASx8j1Au+a5WDVnPi2lkFfwwEAa8hDDdjZlpLegxhjVME1fgjWPGmkzs7" rel="stylesheet"/>
 <link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css" rel="stylesheet"/>
 <link href="../_static/basic.css" rel="stylesheet" type="text/css">
 <link href="../_static/pygments.css" rel="stylesheet" type="text/css">
 <link href="../_static/mxnet.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript">
       var DOCUMENTATION_OPTIONS = {
         URL_ROOT:    '../',
         VERSION:     '',
         COLLAPSE_INDEX: false,
         FILE_SUFFIX: '.html',
         HAS_SOURCE:  true,
         SOURCELINK_SUFFIX: ''
       };
     </script>
 <script src="../_static/jquery-1.11.1.js" type="text/javascript"></script>
 <script src="../_static/underscore.js" type="text/javascript"></script>
 <script src="../_static/searchtools_custom.js" type="text/javascript"></script>
 <script src="../_static/doctools.js" type="text/javascript"></script>
 <script src="../_static/selectlang.js" type="text/javascript"></script>
 <script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript"></script>
 <script type="text/javascript"> jQuery(function() { Search.loadIndex("/searchindex.js"); Search.init();}); </script>
 <script>
       (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
       (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new
       Date();a=s.createElement(o),
       m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
       })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');

       ga('create', 'UA-96378503-1', 'auto');
       ga('send', 'pageview');

     </script>
 <!-- -->
 <!-- <script type="text/javascript" src="../_static/jquery.js"></script> -->
 <!-- -->
 <!-- <script type="text/javascript" src="../_static/underscore.js"></script> -->
 <!-- -->
 <!-- <script type="text/javascript" src="../_static/doctools.js"></script> -->
 <!-- -->
 <!-- <script type="text/javascript" src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script> -->
 <!-- -->
 <link href="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/mxnet-icon.png" rel="icon" type="image/png"/>
 </head>
 <body role="document"><!-- Previous Navbar Layout
 <div class="navbar navbar-default navbar-fixed-top">
   <div class="container">
     <div class="navbar-header">
       <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false" aria-controls="navbar">
         <span class="sr-only">Toggle navigation</span>
         <span class="icon-bar"></span>
         <span class="icon-bar"></span>
         <span class="icon-bar"></span>
       </button>
       <a href="../" class="navbar-brand">
         <img src="http://data.mxnet.io/theme/mxnet.png">
       </a>
     </div>
     <div id="navbar" class="navbar-collapse collapse">
       <ul id="navbar" class="navbar navbar-left">

         <li> <a href="../get_started/index.html">Get Started</a> </li>

         <li> <a href="../tutorials/index.html">Tutorials</a> </li>

         <li> <a href="../how_to/index.html">How To</a> </li>


         <li class="dropdown">
           <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">Packages <span class="caret"></span></a>
           <ul class="dropdown-menu">

             <li><a href="../packages/python/index.html">
                 Python
             </a></li>

             <li><a href="../packages/r/index.html">
                 R
             </a></li>

             <li><a href="../packages/julia/index.html">
                 Julia
             </a></li>

             <li><a href="../packages/c++/index.html">
                 C++
             </a></li>

             <li><a href="../packages/scala/index.html">
                 Scala
             </a></li>

             <li><a href="../packages/perl/index.html">
                 Perl
             </a></li>

           </ul>
         </li>

         <li> <a href="../system/index.html">System</a> </li>
         <li>
 <form class="" role="search" action="../search.html" method="get" autocomplete="off">
   <div class="form-group inner-addon left-addon">
     <i class="glyphicon glyphicon-search"></i>
     <input type="text" name="q" class="form-control" placeholder="Search">
   </div>
   <input type="hidden" name="check_keywords" value="yes" />
   <input type="hidden" name="area" value="default" />

 </form> </li>
       </ul>
       <ul id="navbar" class="navbar navbar-right">
         <li> <a href="../index.html"><span class="flag-icon flag-icon-us"></span></a> </li>
         <li> <a href="..//zh/index.html"><span class="flag-icon flag-icon-cn"></span></a> </li>
       </ul>
     </div>
   </div>
 </div>
 Previous Navbar Layout End -->
 <div class="navbar navbar-fixed-top">
 <div class="container" id="navContainer">
 <div class="innder" id="header-inner">
 <h1 id="logo-wrap">
 <a href="../" id="logo"><img alt="MXNet" src="http://data.mxnet.io/theme/mxnet.png"/></a>
 </h1>
 <nav class="nav-bar" id="main-nav">
 <a class="main-nav-link" href="../get_started/install.html">Install</a>
 <a class="main-nav-link" href="../tutorials/index.html">Tutorials</a>
 <a class="main-nav-link" href="../how_to/index.html">How To</a>
 <span id="dropdown-menu-position-anchor">
 <a aria-expanded="true" aria-haspopup="true" class="main-nav-link dropdown-toggle" data-toggle="dropdown" href="#" role="button">API <span class="caret"></span></a>
 <ul class="dropdown-menu" id="package-dropdown-menu">
 <li><a class="main-nav-link" href="../api/python/index.html">Python</a></li>
 <li><a class="main-nav-link" href="../api/scala/index.html">Scala</a></li>
 <li><a class="main-nav-link" href="../api/r/index.html">R</a></li>
 <li><a class="main-nav-link" href="../api/julia/index.html">Julia</a></li>
 <li><a class="main-nav-link" href="../api/c++/index.html">C++</a></li>
 <li><a class="main-nav-link" href="../api/perl/index.html">Perl</a></li>
 </ul>
 </span>
 <a class="main-nav-link" href="../architecture/index.html">Architecture</a>
 <!-- <a class="main-nav-link" href="../community/index.html">Community</a> -->
 <a class="main-nav-link" href="https://github.com/dmlc/mxnet">Github</a>
 <span id="dropdown-menu-position-anchor-version" style="position: relative"><a href="#" class="main-nav-link dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">Versions(master)<span class="caret"></span></a><ul id="package-dropdown-menu" class="dropdown-menu"><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/">v0.10.14</a></li><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/versions/0.10/index.html">0.10</a></li><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/versions/master/index.html">master</a></li></ul></span></nav>
 <script> function getRootPath(){ return "../" } </script>
 <div class="burgerIcon dropdown">
 <a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button">☰</a>
 <ul class="dropdown-menu dropdown-menu-right" id="burgerMenu">
 <li><a href="../get_started/install.html">Install</a></li>
 <li><a href="../tutorials/index.html">Tutorials</a></li>
 <li><a href="../how_to/index.html">How To</a></li>
 <li class="dropdown-submenu">
 <a href="#" tabindex="-1">API</a>
 <ul class="dropdown-menu">
 <li><a href="../api/python/index.html" tabindex="-1">Python</a>
 </li>
 <li><a href="../api/scala/index.html" tabindex="-1">Scala</a>
 </li>
 <li><a href="../api/r/index.html" tabindex="-1">R</a>
 </li>
 <li><a href="../api/julia/index.html" tabindex="-1">Julia</a>
 </li>
 <li><a href="../api/c++/index.html" tabindex="-1">C++</a>
 </li>
 <li><a href="../api/perl/index.html" tabindex="-1">Perl</a>
 </li>
 </ul>
 </li>
 <li><a href="../architecture/index.html">Architecture</a></li>
 <li><a class="main-nav-link" href="https://github.com/dmlc/mxnet">Github</a></li>
 <li id="dropdown-menu-position-anchor-version-mobile" class="dropdown-submenu" style="position: relative"><a href="#" tabindex="-1">Versions(master)</a><ul class="dropdown-menu"><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/">v0.10.14</a></li><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/versions/0.10/index.html">0.10</a></li><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/versions/master/index.html">master</a></li></ul></li></ul>
 </div>
 <div class="plusIcon dropdown">
 <a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button"><span aria-hidden="true" class="glyphicon glyphicon-plus"></span></a>
 <ul class="dropdown-menu dropdown-menu-right" id="plusMenu"></ul>
 </div>
 <div id="search-input-wrap">
 <form action="../search.html" autocomplete="off" class="" method="get" role="search">
 <div class="form-group inner-addon left-addon">
 <i class="glyphicon glyphicon-search"></i>
 <input class="form-control" name="q" placeholder="Search" type="text"/>
 </div>
 <input name="check_keywords" type="hidden" value="yes">
 <input name="area" type="hidden" value="default"/>
 </form>
 <div id="search-preview"></div>
 </div>
 <div id="searchIcon">
 <span aria-hidden="true" class="glyphicon glyphicon-search"></span>
 </div>
 <!-- <div id="lang-select-wrap"> -->
 <!--   <label id="lang-select-label"> -->
 <!--     <\!-- <i class="fa fa-globe"></i> -\-> -->
 <!--     <span></span> -->
 <!--   </label> -->
 <!--   <select id="lang-select"> -->
 <!--     <option value="en">Eng</option> -->
 <!--     <option value="zh">中文</option> -->
 <!--   </select> -->
 <!-- </div> -->
 <!--     <a id="mobile-nav-toggle">
         <span class="mobile-nav-toggle-bar"></span>
         <span class="mobile-nav-toggle-bar"></span>
         <span class="mobile-nav-toggle-bar"></span>
       </a> -->
 </div>
 </div>
 </div>
 <div class="container">
 <div class="row">
 <div aria-label="main navigation" class="sphinxsidebar leftsidebar" role="navigation">
 <div class="sphinxsidebarwrapper">
 <ul>
 <li class="toctree-l1"><a class="reference internal" href="../api/python/index.html">Python Documents</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../api/r/index.html">R Documents</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../api/julia/index.html">Julia Documents</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../api/c++/index.html">C++ Documents</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../api/scala/index.html">Scala Documents</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../api/perl/index.html">Perl Documents</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../how_to/index.html">HowTo Documents</a></li>
 <li class="toctree-l1"><a class="reference internal" href="index.html">System Documents</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../tutorials/index.html">Tutorials</a></li>
 </ul>
 </div>
 </div>
 <div class="content">
 <div class="section" id="designing-efficient-data-loaders-for-deep-learning">
 <span id="designing-efficient-data-loaders-for-deep-learning"></span><h1>Designing Efficient Data Loaders for Deep Learning<a class="headerlink" href="#designing-efficient-data-loaders-for-deep-learning" title="Permalink to this headline">¶</a></h1>
 <p>Data loading is an important component of any machine learning system.
 When we work with tiny datasets, we can get away with loading an entire dataset into GPU memory.
 With larger datasets, we must store examples in main memory.
 And when datasets grow too large to fit into main memory,
 data loading can become performance-critical.
 In designing a data loader,
 we aim to achieve more efficient data loading,
 to spend less effort on data preparation,
 and to present a clean and flexible interface.</p>
 <p>We organize this design note as follows:</p>
 <ul class="simple">
 <li><strong>IO Design Insight:</strong>  Guiding principles in data loading design.</li>
 <li><strong>Data Format:</strong> Our solution using dmlc-core’s binary recordIO implementation.</li>
 <li><strong>Data Loading:</strong> Our method to reduce IO cost by utilizing the threaded iterator provided by dmlc-core.</li>
 <li><strong>Interface Design:</strong> Our approach to facilitate writing MXNet data iterators in just a few lines of Python.</li>
 <li><strong>Future Extension:</strong> Prospective ideas for making data loading more flexible.</li>
 </ul>
 <p>Our analysis will motivate several requirements that an effective IO system should fulfill.</p>
 <p><strong><em>List of Key Requirements</em></strong></p>
 <ul class="simple">
 <li>Small file size.</li>
 <li>Parallel (distributed) packing of data.</li>
 <li>Fast data loading and online augmentation.</li>
 <li>Quick reads from arbitrary parts of the dataset in the distributed setting.</li>
 </ul>
 <div class="section" id="design-insight">
 <span id="design-insight"></span><h2>Design Insight<a class="headerlink" href="#design-insight" title="Permalink to this headline">¶</a></h2>
 <p>To design an IO system, we must address two kinds of tasks:
 data preparation and data loading.
 Data preparation is usually performed offline,
 whereas data loading influences the online performance.
 In this section, we share our insights into IO design for both of these phases.</p>
 <div class="section" id="data-preparation">
 <span id="data-preparation"></span><h3>Data Preparation<a class="headerlink" href="#data-preparation" title="Permalink to this headline">¶</a></h3>
 <p>Data preparation describes the process of packing data
 into a desired format for later processing.
 When working with large datasets like ImageNet, this process can be time-consuming.
 In these cases, there are several heuristics we ought to follow:</p>
 <ul class="simple">
 <li>Pack the dataset into a small number of files. A dataset may contain millions of data instances. Packed data distributes easily from machine to machine.</li>
 <li>Do the packing once. We don’t want to repack data every time run-time settings, like the number of machines, are changed.</li>
 <li>Process the packing in parallel to save time.</li>
 <li>Be able to access arbitrary parts of the data easily. This is crucial for distributed machine learning when data parallelism is introduced. Things may get tricky when the data has been packed into several physical data files. The desired behavior could be: the packed data can be logically separated into arbitrary numbers of partitions, no matter how many physical data files there are. For example, if we pack 1000 images into 4 physical files, then each file contains 250 images. If we then use 10 machines to train a DNN, we should be able to load approximately 100 images per machine. Some machines may need images from different physical files.</li>
 </ul>
 </div>
 <div class="section" id="data-loading">
 <span id="data-loading"></span><h3>Data Loading<a class="headerlink" href="#data-loading" title="Permalink to this headline">¶</a></h3>
 <p>The next step to consider is how to load the packed data into RAM.
 Our goal is to load the data as quickly as possible.
 There are several heuristics we try to follow:</p>
 <ul class="simple">
 <li><strong>Read continuously:</strong> We can read faster when reading from contiguous locations on disk.</li>
 <li><strong>Reduce the bytes to be loaded:</strong> We can achieve this by storing data in a compact way, e.g. saving images in JPEG format.</li>
 <li><strong>Load and train in different threads:</strong> This avoids computational bottlenecks while loading data.</li>
 <li><strong>Save RAM:</strong> Judiciously decide whether to load entire files into RAM.</li>
 </ul>
 </div>
 </div>
 <div class="section" id="data-format">
 <span id="data-format"></span><h2>Data Format<a class="headerlink" href="#data-format" title="Permalink to this headline">¶</a></h2>
 <p>Since training deep neural networks often involves large amounts of data,
 the format we choose should be both efficient and convenient.
 To achieve our goals, we need to pack binary data into a splittable format.
 In MXNet, we rely on the binary recordIO format implemented in dmlc-core.</p>
 <div class="section" id="binary-record">
 <span id="binary-record"></span><h3>Binary Record<a class="headerlink" href="#binary-record" title="Permalink to this headline">¶</a></h3>
 <p><img alt="baserecordio" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/io/baserecordio.jpg"/>
 In MXNet’s binary RecordIO, we store each data instance as a record.
 <strong>kMagic</strong> is a <em>magic number</em> indicating the start of a record.
 <strong>Lrecord</strong> encodes the record length and a continue flag (cflag).
 In Lrecord,</p>
 <ul class="simple">
 <li>cflag == 0: this is a complete record</li>
 <li>cflag == 1: start of multiple-records</li>
 <li>cflag == 2: middle of multiple-records</li>
 <li>cflag == 3: end of multiple-records</li>
 </ul>
 <p><strong>Data</strong> is the space to save data content.
 <strong>Pad</strong> is simply a padding space to align each record to 4 bytes.</p>
 <p>After we pack the data, each file contains multiple records.
 Then, loading can be continuous.
 This avoids the low performance that can result
 from reading random locations on disk.</p>
 <p>One advantage of storing data via records
 is that each record can vary in length.
 This allows us to save data compactly
 when good compression algorithms are available for our data.
 For example, we can use JPEG format to save image data.
 The packed data will be much smaller
 compared with storing uncompressed RGB values for each pixel.</p>
 <p>Take the ImageNet_1K dataset as an example.
 If we store the data as 3 * 256 * 256 arrays of raw RGB values,
 the dataset would occupy more than <strong>200G</strong>.
 But after compressing the images using JPEG,
 they only occupy about <strong>35G</strong> of disk space.
 This significantly reduces the cost of reading from disk.</p>
 <p>Here’s an example of binary recordIO:
 <img alt="imagerecordio" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/io/ImageRecordIO.jpg"/>
 We first resize the image to 256 * 256,
 then compress it into JPEG format.
 After that, we save a header that indicates the index and label
 for that image to be used when constructing the <em>Data</em> field for that record.
 We then pack several images together into a file.</p>
 </div>
 <div class="section" id="access-arbitrary-parts-of-data">
 <span id="access-arbitrary-parts-of-data"></span><h3>Access Arbitrary Parts Of Data<a class="headerlink" href="#access-arbitrary-parts-of-data" title="Permalink to this headline">¶</a></h3>
 <p>One desirable property for a data loader might be:
 The packed data can be logically sliced into an arbitrary number of partitions,
 no matter how many physical packed data files there are.
 Since binary recordIO can easily locate
 the start and end of a record using the Magic Number,
 we can achieve the above goal using the InputSplit
 functionality provided by dmlc-core.</p>
 <p>InputSplit takes the following parameters:</p>
 <ul class="simple">
 <li>FileSystem <em>filesys</em>: dmlc-core wrapper around the IO operations for different file systems, like hdfs, s3, local. Users shouldn’t need to worry about the difference between file systems anymore.</li>
 <li>Char <em>uri</em>: The URI of files. Note that it could be a list of files because we may pack the data into several physical parts. File URIs are separated by ‘;’.</li>
 <li>Unsigned <em>nsplit</em>: The number of logical splits. <em>nsplit</em> could be different from the number of physical files.</li>
 <li>Unsigned <em>rank</em>: Which split to load in this process.</li>
 </ul>
 <p>The splitting process is demonstrated below:</p>
 <ul class="simple">
 <li>Determine the size of each partition.</li>
 </ul>
 <p><img alt="beforepartition" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/io/beforepartition.jpg"/></p>
 <ul class="simple">
 <li>Approximately partition the records according to file size. Note that the boundary of each part may be located in the middle of a record.</li>
 </ul>
 <p><img alt="approxipartition" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/io/approximatepartition.jpg"/></p>
 <ul class="simple">
 <li>Set the beginning of partitions in such a way as to avoid splitting records across partitions.</li>
 </ul>
 <p><img alt="afterpartition" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/io/afterpartition.jpg"/></p>
 <p>By conducting the above operations,
 we can now identify the records belonging to each part,
 and the physical data files needed by each logical part.
 InputSplit greatly simplifies data parallelism,
 where each process only reads part of the data.</p>
 <p>Since our partitioning scheme does not depend on the number of physical data files,
 we can process a huge dataset like ImageNet_22K in parallel fashion as illustrated below.
 We don’t need to consider distributed loading issues at preparation time;
 we just select the most efficient number of physical files
 according to the dataset size and computing resources available.
 <img alt="parallelprepare" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/io/parallelprepare.jpg"/></p>
 </div>
 </div>
 <div class="section" id="data-loading-and-preprocessing">
 <span id="data-loading-and-preprocessing"></span><h2>Data Loading and Preprocessing<a class="headerlink" href="#data-loading-and-preprocessing" title="Permalink to this headline">¶</a></h2>
 <p>When the speed of loading and preprocessing can’t keep up
 with the speed of training or evaluation,
 IO can bottleneck the speed of the whole system.
 In this section, we will introduce a few tricks
 to achieve greater efficiency when loading
 and preprocessing data packed in binary recordIO format.
 When applied to the ImageNet dataset, our approach achieves
 an IO speed of <strong>3000</strong> images/sec <strong>with a normal HDD</strong>.</p>
 <div class="section" id="loading-and-preprocessing-on-the-fly">
 <span id="loading-and-preprocessing-on-the-fly"></span><h3>Loading and preprocessing on the fly<a class="headerlink" href="#loading-and-preprocessing-on-the-fly" title="Permalink to this headline">¶</a></h3>
 <p>When training deep neural networks,
 we sometimes must load and preprocess the data
 while simultaneously training for the following reasons:</p>
 <ul class="simple">
 <li>When the whole size of the dataset exceeds available RAM size, we can’t load it in advance;</li>
 <li>Sometimes, to make models robust to things like translations, rotations, and small amounts of color shift or noise, we introduce randomness into the training process. In these cases we must re-preprocess the data each time we revisit an example.</li>
 </ul>
 <p>In service of efficiency, we also address multi-threading techniques. Taking ImageNet training as an example, after loading a bunch of image records, we can start multiple threads to simultaneously perform image decoding and image augmentation. We depict this process in the following illustration:
 <img alt="process" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/io/process.jpg"/></p>
 </div>
 <div class="section" id="hide-io-cost-using-threadediter">
 <span id="hide-io-cost-using-threadediter"></span><h3>Hide IO Cost Using Threadediter<a class="headerlink" href="#hide-io-cost-using-threadediter" title="Permalink to this headline">¶</a></h3>
 <p>One way to lower IO cost is to pre-fetch the data for the next batch on one thread,
 while the main thread performs the forward and backward passes for training.
 To support more complicated training schemes,
 MXNet provides a more general IO processing pipeline
 using <em>threadediter</em> provided by dmlc-core.
 The key idea of <em>threadediter</em> is to start a stand-alone thread that acts as a data provider,
 while the main thread acts as a data consumer as illustrated below.</p>
 <p>The threadediter maintains a buffer of a certain size
 and automatically fills the buffer when it’s not full.
 And after the consumer finishes consuming part of the data in the buffer,
 threadediter will reuse the space to save the next part of data.
 <img alt="threadediter" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/io/threadediter.png"/></p>
 </div>
 </div>
 <div class="section" id="mxnet-io-python-interface">
 <span id="mxnet-io-python-interface"></span><h2>MXNet IO Python Interface<a class="headerlink" href="#mxnet-io-python-interface" title="Permalink to this headline">¶</a></h2>
 <p>We make the IO object an iterator in numpy.
 By doing so, the user can easily access the data
 using a for-loop or by calling the next() function.
 Defining a data iterator is very similar to defining a symbolic operator in MXNet.</p>
 <p>The following example code demonstrates a Cifar data iterator.</p>
 <div class="highlight-python"><div class="highlight"><pre><span></span><span class="n">dataiter</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">io</span><span class="o">.</span><span class="n">ImageRecordIter</span><span class="p">(</span>
     <span class="c1"># Dataset Parameter, indicating the data file, please check the data is already there</span>
     <span class="n">path_imgrec</span><span class="o">=</span><span class="s2">"data/cifar/train.rec"</span><span class="p">,</span>
     <span class="c1"># Dataset Parameter, indicating the image size after preprocessing</span>
     <span class="n">data_shape</span><span class="o">=</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span><span class="mi">28</span><span class="p">,</span><span class="mi">28</span><span class="p">),</span>
     <span class="c1"># Batch Parameter, tells how many images in a batch</span>
     <span class="n">batch_size</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span>
     <span class="c1"># Augmentation Parameter, when offers mean_img, each image will subtract the mean value at each pixel</span>
     <span class="n">mean_img</span><span class="o">=</span><span class="s2">"data/cifar/cifar10_mean.bin"</span><span class="p">,</span>
     <span class="c1"># Augmentation Parameter, randomly crop a patch of the data_shape from the original image</span>
     <span class="n">rand_crop</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span>
     <span class="c1"># Augmentation Parameter, randomly mirror the image horizontally</span>
     <span class="n">rand_mirror</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span>
     <span class="c1"># Augmentation Parameter, randomly shuffle the data</span>
     <span class="n">shuffle</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span>
     <span class="c1"># Backend Parameter, preprocessing thread number</span>
     <span class="n">preprocess_threads</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span>
     <span class="c1"># Backend Parameter, prefetch buffer size</span>
     <span class="n">prefetch_buffer</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
 </pre></div>
 </div>
 <p>Generally, to create a data iterator, you need to provide five kinds of parameters:</p>
 <ul class="simple">
 <li><strong>Dataset Param:</strong> Information needed to access the dataset, e.g. file path, input shape.</li>
 <li><strong>Batch Param:</strong> Specifies how to form a batch, e.g. batch size.</li>
 <li><strong>Augmentation Param:</strong> Which augmentation operations (e.g. crop, mirror) should be taken on an input image.</li>
 <li><strong>Backend Param:</strong> Controls the behavior of the backend threads to hide data loading cost.</li>
 <li><strong>Auxiliary Param:</strong> Provides options to help with debugging.</li>
 </ul>
 <p>Usually, <strong>Dataset Param</strong> and <strong>Batch Param</strong> MUST be given,
 otherwise the data batch can’t be created.
 Other parameters can be given as needed.
 Ideally, we should separate the MX Data IO into modules,
 some of which might be useful to expose to users, for example:</p>
 <ul class="simple">
 <li><strong>Efficient prefetcher:</strong> allows the user to write a data loader that reads their customized binary format and automatically gets multi-threaded prefetcher support.</li>
 <li><strong>Data transformer:</strong> image random cropping, mirroring, etc. Allows the users to use those tools, or plug in their own customized transformers (maybe they want to add some specific kind of coherent random noise to data, etc.)</li>
 </ul>
 </div>
 <div class="section" id="future-extensions">
 <span id="future-extensions"></span><h2>Future Extensions<a class="headerlink" href="#future-extensions" title="Permalink to this headline">¶</a></h2>
 <p>In the future, there are some extensions to our data IO
 that we might consider adding.
 Specifically, we might add specialized support
 for applications including image segmentation, object localization, and speech recognition.
 More detail will be provided when such applications have been running on MXNet.</p>
 </div>
 </div>
 <div class="container">
 <div class="footer">
 <p> © 2015-2017 DMLC. All rights reserved. </p>
 </div>
 </div>
 </div>
 <div aria-label="main navigation" class="sphinxsidebar rightsidebar" role="navigation">
 <div class="sphinxsidebarwrapper">
 <h3><a href="../index.html">Table Of Contents</a></h3>
 <ul>
 <li><a class="reference internal" href="#">Designing Efficient Data Loaders for Deep Learning</a><ul>
 <li><a class="reference internal" href="#design-insight">Design Insight</a><ul>
 <li><a class="reference internal" href="#data-preparation">Data Preparation</a></li>
 <li><a class="reference internal" href="#data-loading">Data Loading</a></li>
 </ul>
 </li>
 <li><a class="reference internal" href="#data-format">Data Format</a><ul>
 <li><a class="reference internal" href="#binary-record">Binary Record</a></li>
 <li><a class="reference internal" href="#access-arbitrary-parts-of-data">Access Arbitrary Parts Of Data</a></li>
 </ul>
 </li>
 <li><a class="reference internal" href="#data-loading-and-preprocessing">Data Loading and Preprocessing</a><ul>
 <li><a class="reference internal" href="#loading-and-preprocessing-on-the-fly">Loading and preprocessing on the fly</a></li>
 <li><a class="reference internal" href="#hide-io-cost-using-threadediter">Hide IO Cost Using Threadediter</a></li>
 </ul>
 </li>
 <li><a class="reference internal" href="#mxnet-io-python-interface">MXNet IO Python Interface</a></li>
 <li><a class="reference internal" href="#future-extensions">Future Extensions</a></li>
 </ul>
 </li>
 </ul>
 </div>
 </div>
 </div> <!-- pagename != index -->
 <script crossorigin="anonymous" integrity="sha384-0mSbJDEHialfmuBBQP6A4Qrprq5OVfW37PRR3j5ELqxss1yVqOtnepnHVP9aJ7xS" src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js"></script>
 <script src="../_static/js/sidebar.js" type="text/javascript"></script>
 <script src="../_static/js/search.js" type="text/javascript"></script>
 <script src="../_static/js/navbar.js" type="text/javascript"></script>
 <script src="../_static/js/clipboard.min.js" type="text/javascript"></script>
 <script src="../_static/js/copycode.js" type="text/javascript"></script>
 <script type="text/javascript">
         $('body').ready(function () {
             $('body').css('visibility', 'visible');
         });
     </script>
 </div></body>
 </html>