| |
| |
| |
| <!DOCTYPE html> |
| <html class="writer-html5" lang="en" > |
| <head> |
| <meta charset="utf-8"> |
| |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| |
| <title>Tabular Datasets — Apache Arrow v2.0.0</title> |
| |
| |
| |
| <link rel="stylesheet" href="../_static/css/theme.css" type="text/css" /> |
| <link rel="stylesheet" href="../_static/pygments.css" type="text/css" /> |
| |
| |
| |
| |
| |
| |
| |
| <!--[if lt IE 9]> |
| <script src="../_static/js/html5shiv.min.js"></script> |
| <![endif]--> |
| |
| |
| <script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script> |
| <script src="../_static/jquery.js"></script> |
| <script src="../_static/underscore.js"></script> |
| <script src="../_static/doctools.js"></script> |
| <script src="../_static/language_data.js"></script> |
| |
| <script type="text/javascript" src="../_static/js/theme.js"></script> |
| |
| |
| <link rel="canonical" href="https://arrow.apache.org/docs/python/dataset.html" /> |
| <link rel="index" title="Index" href="../genindex.html" /> |
| <link rel="search" title="Search" href="../search.html" /> |
| <link rel="next" title="CUDA Integration" href="cuda.html" /> |
| <link rel="prev" title="Reading and Writing the Apache Parquet Format" href="parquet.html" /> |
|
|
|
|
| <!-- Matomo -->
|
| <script>
|
| var _paq = window._paq = window._paq || [];
|
| /* tracker methods like "setCustomDimension" should be called before "trackPageView" */
|
| _paq.push(["setDoNotTrack", true]);
|
| _paq.push(["disableCookies"]);
|
| _paq.push(['trackPageView']);
|
| _paq.push(['enableLinkTracking']);
|
| (function() {
|
| var u="https://analytics.apache.org/";
|
| _paq.push(['setTrackerUrl', u+'matomo.php']);
|
| _paq.push(['setSiteId', '20']);
|
| var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
|
| g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
|
| })();
|
| </script>
|
| <!-- End Matomo Code -->
|
|
|
| </head> |
| |
| <body class="wy-body-for-nav"> |
| |
| |
| <div class="wy-grid-for-nav"> |
| |
| <nav data-toggle="wy-nav-shift" class="wy-nav-side"> |
| <div class="wy-side-scroll"> |
| <div class="wy-side-nav-search" > |
| |
| |
| |
| <a href="../index.html" class="icon icon-home" title="Documentation Home"> Apache Arrow |
| |
| |
| |
| </a> |
| |
| |
| |
| |
| <div class="version"> |
| 2.0.0 |
| </div> |
| |
| |
| |
| |
| <div role="search"> |
| <form id="rtd-search-form" class="wy-form" action="../search.html" method="get"> |
| <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" /> |
| <input type="hidden" name="check_keywords" value="yes" /> |
| <input type="hidden" name="area" value="default" /> |
| </form> |
| </div> |
| |
| |
| </div> |
| |
| |
| <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation"> |
| |
| |
| |
| |
| |
| |
| <p class="caption"><span class="caption-text">Specifications and Protocols</span></p> |
| <ul> |
| <li class="toctree-l1"><a class="reference internal" href="../format/Versioning.html">Format Versioning and Stability</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../format/Columnar.html">Arrow Columnar Format</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../format/Flight.html">Arrow Flight RPC</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../format/Integration.html">Integration Testing</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../format/CDataInterface.html">The Arrow C data interface</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../format/CStreamInterface.html">The Arrow C stream interface</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../format/Other.html">Other Data Structures</a></li> |
| </ul> |
| <p class="caption"><span class="caption-text">Libraries</span></p> |
| <ul class="current"> |
| <li class="toctree-l1"><a class="reference internal" href="../status.html">Implementation Status</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/c_glib/">C/GLib</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../cpp/index.html">C++</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/csharp/README.md">C#</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://godoc.org/github.com/apache/arrow/go/arrow">Go</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../java/index.html">Java</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/js/">JavaScript</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/matlab/README.md">MATLAB</a></li> |
| <li class="toctree-l1 current"><a class="reference internal" href="index.html">Python</a><ul class="current"> |
| <li class="toctree-l2"><a class="reference internal" href="install.html">Installing PyArrow</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="memory.html">Memory and IO Interfaces</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="data.html">Data Types and In-Memory Data Model</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="compute.html">Compute Functions</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="ipc.html">Streaming, Serialization, and IPC</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="filesystems.html">Filesystem Interface</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="filesystems_deprecated.html">Filesystem Interface (legacy)</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="plasma.html">The Plasma In-Memory Object Store</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="numpy.html">NumPy Integration</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="pandas.html">Pandas Integration</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="timestamps.html">Timestamps</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="csv.html">Reading CSV files</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="feather.html">Feather File Format</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="json.html">Reading JSON files</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="parquet.html">Reading and Writing the Apache Parquet Format</a></li> |
| <li class="toctree-l2 current"><a class="current reference internal" href="#">Tabular Datasets</a><ul> |
| <li class="toctree-l3"><a class="reference internal" href="#reading-datasets">Reading Datasets</a><ul> |
| <li class="toctree-l4"><a class="reference internal" href="#dataset-discovery">Dataset discovery</a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#reading-different-file-formats">Reading different file formats</a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#customizing-file-formats">Customizing file formats</a></li> |
| </ul> |
| </li> |
| <li class="toctree-l3"><a class="reference internal" href="#filtering-data">Filtering data</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="#reading-partitioned-data">Reading partitioned data</a><ul> |
| <li class="toctree-l4"><a class="reference internal" href="#different-partitioning-schemes">Different partitioning schemes</a></li> |
| </ul> |
| </li> |
| <li class="toctree-l3"><a class="reference internal" href="#reading-from-cloud-storage">Reading from cloud storage</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="#reading-from-minio">Reading from Minio</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="#working-with-parquet-datasets">Working with Parquet Datasets</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="#manual-specification-of-the-dataset">Manual specification of the Dataset</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="#manual-scheduling">Manual scheduling</a></li> |
| </ul> |
| </li> |
| <li class="toctree-l2"><a class="reference internal" href="cuda.html">CUDA Integration</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="extending_types.html">Extending pyarrow</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="extending.html">Using pyarrow from C++ and Cython Code</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="api.html">API Reference</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="getting_involved.html">Getting Involved</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="benchmarks.html">Benchmarks</a></li> |
| </ul> |
| </li> |
| <li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/r/">R</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/ruby/README.md">Ruby</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://docs.rs/crate/arrow/">Rust</a></li> |
| </ul> |
| <p class="caption"><span class="caption-text">Development</span></p> |
| <ul> |
| <li class="toctree-l1"><a class="reference internal" href="../developers/contributing.html">Contributing to Apache Arrow</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../developers/cpp/index.html">C++ Development</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../developers/python.html">Python Development</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../developers/archery.html">Daily Development using Archery</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../developers/crossbow.html">Packaging and Testing with Crossbow</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../developers/docker.html">Running Docker Builds</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../developers/benchmarks.html">Benchmarks</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../developers/documentation.html">Building the Documentation</a></li> |
| </ul> |
| |
| |
| |
| </div> |
| |
| </div> |
| </nav> |
| |
| <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"> |
| |
| |
| <nav class="wy-nav-top" aria-label="top navigation"> |
| |
| <i data-toggle="wy-nav-top" class="fa fa-bars"></i> |
| <a href="../index.html">Apache Arrow</a> |
| |
| </nav> |
| |
| |
| <div class="wy-nav-content"> |
| |
| <div class="rst-content"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <div role="navigation" aria-label="breadcrumbs navigation"> |
| |
| <ul class="wy-breadcrumbs"> |
| |
| <li><a href="../index.html" class="icon icon-home" aria-label="Home"></a> »</li> |
| |
| <li><a href="index.html">Python bindings</a> »</li> |
| |
| <li>Tabular Datasets</li> |
| |
| |
| <li class="wy-breadcrumbs-aside"> |
| |
| |
| <a href="../_sources/python/dataset.rst.txt" rel="nofollow"> View page source</a> |
| |
| |
| </li> |
| |
| </ul> |
| |
| |
| <hr/> |
| </div> |
| <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article"> |
| <div itemprop="articleBody"> |
| |
| <div class="section" id="tabular-datasets"> |
| <span id="dataset"></span><h1>Tabular Datasets<a class="headerlink" href="#tabular-datasets" title="Permalink to this headline">¶</a></h1> |
| <div class="admonition warning"> |
| <p class="admonition-title">Warning</p> |
| <p>The <code class="docutils literal notranslate"><span class="pre">pyarrow.dataset</span></code> module is experimental (specifically the classes), |
| and a stable API is not yet guaranteed.</p> |
| </div> |
| <p>The <code class="docutils literal notranslate"><span class="pre">pyarrow.dataset</span></code> module provides functionality to efficiently work with |
| tabular, potentially larger than memory and multi-file datasets:</p> |
| <ul class="simple"> |
| <li><p>A unified interface for different sources: supporting different sources and |
| file formats (Parquet, Feather files) and different file systems (local, |
| cloud).</p></li> |
| <li><p>Discovery of sources (crawling directories, handling directory-based partitioned |
| datasets, basic schema normalization, ...)</p></li> |
| <li><p>Optimized reading with predicate pushdown (filtering rows), projection |
| (selecting columns), parallel reading or fine-grained managing of tasks.</p></li> |
| </ul> |
| <p>Currently, only Parquet and Feather / Arrow IPC files are supported. The goal |
| is to expand this in the future to other file formats and data sources (e.g. |
| database connections).</p> |
| <p>For those familiar with the existing <a class="reference internal" href="generated/pyarrow.parquet.ParquetDataset.html#pyarrow.parquet.ParquetDataset" title="pyarrow.parquet.ParquetDataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyarrow.parquet.ParquetDataset</span></code></a> for |
| reading Parquet datasets: <code class="docutils literal notranslate"><span class="pre">pyarrow.dataset</span></code>’s goal is similar but not specific |
| to the Parquet format and not tied to Python: the same datasets API is exposed |
| in the R bindings of Arrow. In addition <code class="docutils literal notranslate"><span class="pre">pyarrow.dataset</span></code> boasts improved |
| performance and new features (e.g. filtering within files rather than only on |
| partition keys).</p> |
| <div class="section" id="reading-datasets"> |
| <h2>Reading Datasets<a class="headerlink" href="#reading-datasets" title="Permalink to this headline">¶</a></h2> |
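| <p>For the examples below, let’s create a small dataset consisting |
| of a directory with two Parquet files (the snippets assume NumPy has been imported as <code class="docutils literal notranslate"><span class="pre">np</span></code>):</p> |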
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [1]: </span><span class="kn">import</span> <span class="nn">tempfile</span> |
| |
| <span class="gp">In [2]: </span><span class="kn">import</span> <span class="nn">pathlib</span> |
| |
| <span class="gp">In [3]: </span><span class="kn">import</span> <span class="nn">pyarrow</span> <span class="kn">as</span> <span class="nn">pa</span> |
| |
| <span class="gp">In [4]: </span><span class="kn">import</span> <span class="nn">pyarrow.parquet</span> <span class="kn">as</span> <span class="nn">pq</span> |
| |
| <span class="gp">In [5]: </span><span class="n">base</span> <span class="o">=</span> <span class="n">pathlib</span><span class="o">.</span><span class="n">Path</span><span class="p">(</span><span class="n">tempfile</span><span class="o">.</span><span class="n">gettempdir</span><span class="p">())</span> |
| |
| <span class="gp">In [6]: </span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset"</span><span class="p">)</span><span class="o">.</span><span class="n">mkdir</span><span class="p">(</span><span class="n">exist_ok</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span> |
| |
| <span class="go"># creating an Arrow Table</span> |
| <span class="gp">In [7]: </span><span class="n">table</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">table</span><span class="p">({</span><span class="s1">'a'</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span> <span class="s1">'b'</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span> <span class="s1">'c'</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">]</span> <span class="o">*</span> <span class="mi">5</span><span class="p">})</span> |
| |
| <span class="go"># writing it into two parquet files</span> |
| <span class="gp">In [8]: </span><span class="n">pq</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="o">.</span><span class="n">slice</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">5</span><span class="p">),</span> <span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset/data1.parquet"</span><span class="p">)</span> |
| |
| <span class="gp">In [9]: </span><span class="n">pq</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="o">.</span><span class="n">slice</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">10</span><span class="p">),</span> <span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset/data2.parquet"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <div class="section" id="dataset-discovery"> |
| <h3>Dataset discovery<a class="headerlink" href="#dataset-discovery" title="Permalink to this headline">¶</a></h3> |
| <p>A <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset" title="pyarrow.dataset.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> object can be created with the <a class="reference internal" href="generated/pyarrow.dataset.dataset.html#pyarrow.dataset.dataset" title="pyarrow.dataset.dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">dataset()</span></code></a> function. We |
| can pass it the path to the directory containing the data files:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [10]: </span><span class="kn">import</span> <span class="nn">pyarrow.dataset</span> <span class="kn">as</span> <span class="nn">ds</span> |
| |
| <span class="gp">In [11]: </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset"</span><span class="p">,</span> <span class="n">format</span><span class="o">=</span><span class="s2">"parquet"</span><span class="p">)</span> |
| |
| <span class="gp">In [12]: </span><span class="n">dataset</span> |
| <span class="gh">Out[12]: </span><span class="go"><pyarrow._dataset.FileSystemDataset at 0x7fea83e1f8f0></span> |
| </pre></div> |
| </div> |
| <p>In addition to a base directory path, <a class="reference internal" href="generated/pyarrow.dataset.dataset.html#pyarrow.dataset.dataset" title="pyarrow.dataset.dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">dataset()</span></code></a> accepts a path to a single |
| file or a list of file paths.</p> |
| <p>Creating a <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset" title="pyarrow.dataset.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> object loads nothing into memory; it only crawls the |
| directory to find all the files:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [13]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">files</span> |
| <span class="gh">Out[13]: </span><span class="go">['/tmp/parquet_dataset/data1.parquet', '/tmp/parquet_dataset/data2.parquet']</span> |
| </pre></div> |
| </div> |
| <p>… and infers the dataset’s schema (by default from the first file):</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [14]: </span><span class="k">print</span><span class="p">(</span><span class="n">dataset</span><span class="o">.</span><span class="n">schema</span><span class="o">.</span><span class="n">to_string</span><span class="p">(</span><span class="n">show_field_metadata</span><span class="o">=</span><span class="bp">False</span><span class="p">))</span> |
| <span class="go">a: int64</span> |
| <span class="go">b: double</span> |
| <span class="go">c: int64</span> |
| </pre></div> |
| </div> |
| <p>Using the <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.to_table" title="pyarrow.dataset.Dataset.to_table"><code class="xref py py-meth docutils literal notranslate"><span class="pre">Dataset.to_table()</span></code></a> method we can read the dataset (or a portion |
| of it) into a pyarrow Table (note that depending on the size of your dataset |
| this can require a lot of memory, see below on filtering / iterative loading):</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [15]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">()</span> |
| <span class="gh">Out[15]: </span><span class="go"></span> |
| <span class="go">pyarrow.Table</span> |
| <span class="go">a: int64</span> |
| <span class="go">b: double</span> |
| <span class="go">c: int64</span> |
| |
| <span class="go"># converting to pandas to see the contents of the scanned table</span> |
| <span class="gp">In [16]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">()</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span> |
| <span class="gh">Out[16]: </span><span class="go"></span> |
| <span class="go"> a b c</span> |
| <span class="go">0 0 1.537236 1</span> |
| <span class="go">1 1 -0.165335 2</span> |
| <span class="go">2 2 -0.200643 1</span> |
| <span class="go">3 3 0.606877 2</span> |
| <span class="go">4 4 -0.788530 1</span> |
| <span class="go">5 5 0.872821 2</span> |
| <span class="go">6 6 0.263579 1</span> |
| <span class="go">7 7 -1.392005 2</span> |
| <span class="go">8 8 0.346150 1</span> |
| <span class="go">9 9 -0.414411 2</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="section" id="reading-different-file-formats"> |
| <h3>Reading different file formats<a class="headerlink" href="#reading-different-file-formats" title="Permalink to this headline">¶</a></h3> |
| <p>The above examples use Parquet files as dataset source but the Dataset API |
| provides a consistent interface across multiple file formats and sources. |
| Currently, Parquet and Feather / Arrow IPC file formats are supported; more |
| formats are planned in the future.</p> |
| <p>If we save the table as a Feather file instead of Parquet files:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [17]: </span><span class="kn">import</span> <span class="nn">pyarrow.feather</span> <span class="kn">as</span> <span class="nn">feather</span> |
| |
| <span class="gp">In [18]: </span><span class="n">feather</span><span class="o">.</span><span class="n">write_feather</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="n">base</span> <span class="o">/</span> <span class="s2">"data.feather"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <p>then we can read the Feather file using the same functions, but specifying |
| <code class="docutils literal notranslate"><span class="pre">format="feather"</span></code>:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [19]: </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">"data.feather"</span><span class="p">,</span> <span class="n">format</span><span class="o">=</span><span class="s2">"feather"</span><span class="p">)</span> |
| |
| <span class="gp">In [20]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">()</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">head</span><span class="p">()</span> |
| <span class="gh">Out[20]: </span><span class="go"></span> |
| <span class="go"> a b c</span> |
| <span class="go">0 0 1.537236 1</span> |
| <span class="go">1 1 -0.165335 2</span> |
| <span class="go">2 2 -0.200643 1</span> |
| <span class="go">3 3 0.606877 2</span> |
| <span class="go">4 4 -0.788530 1</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="section" id="customizing-file-formats"> |
| <h3>Customizing file formats<a class="headerlink" href="#customizing-file-formats" title="Permalink to this headline">¶</a></h3> |
| <p>The format name as a string, like:</p> |
| <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="o">...</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s2">"parquet"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <p>is shorthand for a default-constructed <a class="reference internal" href="generated/pyarrow.dataset.ParquetFileFormat.html#pyarrow.dataset.ParquetFileFormat" title="pyarrow.dataset.ParquetFileFormat"><code class="xref py py-class docutils literal notranslate"><span class="pre">ParquetFileFormat</span></code></a>:</p> |
| <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="o">...</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">ParquetFileFormat</span><span class="p">())</span> |
| </pre></div> |
| </div> |
| <p>The <a class="reference internal" href="generated/pyarrow.dataset.FileFormat.html#pyarrow.dataset.FileFormat" title="pyarrow.dataset.FileFormat"><code class="xref py py-class docutils literal notranslate"><span class="pre">FileFormat</span></code></a> objects can be customized using keywords. For example:</p> |
| <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">parquet_format</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">ParquetFileFormat</span><span class="p">(</span><span class="n">read_options</span><span class="o">=</span><span class="p">{</span><span class="s1">'dictionary_columns'</span><span class="p">:</span> <span class="p">[</span><span class="s1">'a'</span><span class="p">]})</span> |
| <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="o">...</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="n">parquet_format</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <p>This will configure column <code class="docutils literal notranslate"><span class="pre">"a"</span></code> to be dictionary-encoded on scan.</p> |
| </div> |
| </div> |
| <div class="section" id="filtering-data"> |
| <h2>Filtering data<a class="headerlink" href="#filtering-data" title="Permalink to this headline">¶</a></h2> |
| <p>To avoid reading all data when only needing a subset, the <code class="docutils literal notranslate"><span class="pre">columns</span></code> and |
| <code class="docutils literal notranslate"><span class="pre">filter</span></code> keywords can be used.</p> |
| <p>The <code class="docutils literal notranslate"><span class="pre">columns</span></code> keyword can be used to only read the specified columns:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [21]: </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset"</span><span class="p">,</span> <span class="n">format</span><span class="o">=</span><span class="s2">"parquet"</span><span class="p">)</span> |
| |
| <span class="gp">In [22]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s1">'a'</span><span class="p">,</span> <span class="s1">'b'</span><span class="p">])</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span> |
| <span class="gh">Out[22]: </span><span class="go"></span> |
| <span class="go"> a b</span> |
| <span class="go">0 0 1.537236</span> |
| <span class="go">1 1 -0.165335</span> |
| <span class="go">2 2 -0.200643</span> |
| <span class="go">3 3 0.606877</span> |
| <span class="go">4 4 -0.788530</span> |
| <span class="go">5 5 0.872821</span> |
| <span class="go">6 6 0.263579</span> |
| <span class="go">7 7 -1.392005</span> |
| <span class="go">8 8 0.346150</span> |
| <span class="go">9 9 -0.414411</span> |
| </pre></div> |
| </div> |
| <p>With the <code class="docutils literal notranslate"><span class="pre">filter</span></code> keyword, rows which do not match the filter predicate will |
| not be included in the returned table. The keyword expects a boolean |
| <a class="reference internal" href="generated/pyarrow.dataset.Expression.html#pyarrow.dataset.Expression" title="pyarrow.dataset.Expression"><code class="xref py py-class docutils literal notranslate"><span class="pre">Expression</span></code></a> referencing at least one of the columns:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [23]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="nb">filter</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">'a'</span><span class="p">)</span> <span class="o">>=</span> <span class="mi">7</span><span class="p">)</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span> |
| <span class="gh">Out[23]: </span><span class="go"></span> |
| <span class="go"> a b c</span> |
| <span class="go">0 7 -1.392005 2</span> |
| <span class="go">1 8 0.346150 1</span> |
| <span class="go">2 9 -0.414411 2</span> |
| |
| <span class="gp">In [24]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="nb">filter</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">'c'</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span> |
| <span class="gh">Out[24]: </span><span class="go"></span> |
| <span class="go"> a b c</span> |
| <span class="go">0 1 -0.165335 2</span> |
| <span class="go">1 3 0.606877 2</span> |
| <span class="go">2 5 0.872821 2</span> |
| <span class="go">3 7 -1.392005 2</span> |
| <span class="go">4 9 -0.414411 2</span> |
| </pre></div> |
| </div> |
| <p>The easiest way to construct those <a class="reference internal" href="generated/pyarrow.dataset.Expression.html#pyarrow.dataset.Expression" title="pyarrow.dataset.Expression"><code class="xref py py-class docutils literal notranslate"><span class="pre">Expression</span></code></a> objects is by using the |
| <a class="reference internal" href="generated/pyarrow.dataset.field.html#pyarrow.dataset.field" title="pyarrow.dataset.field"><code class="xref py py-func docutils literal notranslate"><span class="pre">field()</span></code></a> helper function. Any column - not just partition columns - can be |
| referenced using the <a class="reference internal" href="generated/pyarrow.dataset.field.html#pyarrow.dataset.field" title="pyarrow.dataset.field"><code class="xref py py-func docutils literal notranslate"><span class="pre">field()</span></code></a> function (which creates a |
| <code class="xref py py-class docutils literal notranslate"><span class="pre">FieldExpression</span></code>). Operator overloads are provided to compose filters |
| including the comparisons (equal, larger/less than, etc.), set membership |
| testing, and boolean combinations (and, or, not):</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [25]: </span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">'a'</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">3</span> |
| <span class="gh">Out[25]: </span><span class="go"><pyarrow.dataset.Expression (a != 3:int64)></span> |
| |
| <span class="gp">In [26]: </span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">'a'</span><span class="p">)</span><span class="o">.</span><span class="n">isin</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">])</span> |
| <span class="gh">Out[26]: </span><span class="go"></span> |
| <span class="go"><pyarrow.dataset.Expression (a is in [</span> |
| <span class="go"> 1,</span> |
| <span class="go"> 2,</span> |
| <span class="go"> 3</span> |
| <span class="go">])></span> |
| |
| <span class="gp">In [27]: </span><span class="p">(</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">'a'</span><span class="p">)</span> <span class="o">></span> <span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">'b'</span><span class="p">))</span> <span class="o">&</span> <span class="p">(</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">'b'</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span><span class="p">)</span> |
| <span class="gh">Out[27]: </span><span class="go"><pyarrow.dataset.Expression ((a > b) and (b > 1:int64))></span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="section" id="reading-partitioned-data"> |
| <h2>Reading partitioned data<a class="headerlink" href="#reading-partitioned-data" title="Permalink to this headline">¶</a></h2> |
| <p>Above, a dataset consisting of a flat directory with files was shown. However, a |
| dataset can exploit a nested directory structure defining a partitioned dataset, |
| where the sub-directory names hold information about which subset of the data is |
| stored in that directory.</p> |
| <p>For example, a dataset partitioned by year and month may look like this on disk:</p> |
| <div class="highlight-text notranslate"><div class="highlight"><pre><span></span>dataset_name/ |
| year=2007/ |
| month=01/ |
| data0.parquet |
| data1.parquet |
| ... |
| month=02/ |
| data0.parquet |
| data1.parquet |
| ... |
| month=03/ |
| ... |
| year=2008/ |
| month=01/ |
| ... |
| ... |
| </pre></div> |
| </div> |
| <p>The above partitioning scheme is using “/key=value/” directory names, as found |
| in Apache Hive.</p> |
| <p>Let’s create a small partitioned dataset. The <a class="reference internal" href="generated/pyarrow.parquet.write_to_dataset.html#pyarrow.parquet.write_to_dataset" title="pyarrow.parquet.write_to_dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">write_to_dataset()</span></code></a> |
| function can write such hive-like partitioned datasets.</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [28]: </span><span class="n">table</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">table</span><span class="p">({</span><span class="s1">'a'</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span> <span class="s1">'b'</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span> <span class="s1">'c'</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">]</span> <span class="o">*</span> <span class="mi">5</span><span class="p">,</span> |
| <span class="gp"> ....: </span> <span class="s1">'part'</span><span class="p">:</span> <span class="p">[</span><span class="s1">'a'</span><span class="p">]</span> <span class="o">*</span> <span class="mi">5</span> <span class="o">+</span> <span class="p">[</span><span class="s1">'b'</span><span class="p">]</span> <span class="o">*</span> <span class="mi">5</span><span class="p">})</span> |
| <span class="gp"> ....: </span> |
| |
| <span class="gp">In [29]: </span><span class="n">pq</span><span class="o">.</span><span class="n">write_to_dataset</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="nb">str</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset_partitioned"</span><span class="p">),</span> |
| <span class="gp"> ....: </span> <span class="n">partition_cols</span><span class="o">=</span><span class="p">[</span><span class="s1">'part'</span><span class="p">])</span> |
| <span class="gp"> ....: </span> |
| </pre></div> |
| </div> |
| <p>The above created a directory with two subdirectories (“part=a” and “part=b”), |
| and the Parquet files written in those directories no longer include the “part” |
| column.</p> |
| <p>Reading this dataset with <a class="reference internal" href="generated/pyarrow.dataset.dataset.html#pyarrow.dataset.dataset" title="pyarrow.dataset.dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">dataset()</span></code></a>, we now specify that the dataset |
| uses a hive-like partitioning scheme with the <cite>partitioning</cite> keyword:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [30]: </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset_partitioned"</span><span class="p">),</span> <span class="n">format</span><span class="o">=</span><span class="s2">"parquet"</span><span class="p">,</span> |
| <span class="gp"> ....: </span> <span class="n">partitioning</span><span class="o">=</span><span class="s2">"hive"</span><span class="p">)</span> |
| <span class="gp"> ....: </span> |
| |
| <span class="gp">In [31]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">files</span> |
| <span class="gh">Out[31]: </span><span class="go"></span> |
| <span class="go">['/tmp/parquet_dataset_partitioned/part=a/fc4d5802496645d8a712ccdf6f1e02da.parquet',</span> |
| <span class="go"> '/tmp/parquet_dataset_partitioned/part=b/348d983e005740aeb57da4b7733ac09c.parquet']</span> |
| </pre></div> |
| </div> |
| <p>Although the partition fields are not included in the actual Parquet files, |
| they will be added back to the resulting table when scanning this dataset:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [32]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">()</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span> |
| <span class="gh">Out[32]: </span><span class="go"></span> |
| <span class="go"> a b c part</span> |
| <span class="go">0 0 -0.416769 1 a</span> |
| <span class="go">1 1 -0.597459 2 a</span> |
| <span class="go">2 2 0.076372 1 a</span> |
| </pre></div> |
| </div> |
| <p>We can now filter on the partition keys, which avoids loading files |
| altogether if they do not match the predicate:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [33]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="nb">filter</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s2">"part"</span><span class="p">)</span> <span class="o">==</span> <span class="s2">"b"</span><span class="p">)</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span> |
| <span class="gh">Out[33]: </span><span class="go"></span> |
| <span class="go"> a b c part</span> |
| <span class="go">0 5 -1.236938 2 b</span> |
| <span class="go">1 6 0.563724 1 b</span> |
| <span class="go">2 7 -0.984011 2 b</span> |
| <span class="go">3 8 0.428099 1 b</span> |
| <span class="go">4 9 -0.482604 2 b</span> |
| </pre></div> |
| </div> |
| <div class="section" id="different-partitioning-schemes"> |
| <h3>Different partitioning schemes<a class="headerlink" href="#different-partitioning-schemes" title="Permalink to this headline">¶</a></h3> |
| <p>The above example uses a hive-like directory scheme, such as “/year=2009/month=11/day=15”. |
| We specified this by passing the <code class="docutils literal notranslate"><span class="pre">partitioning="hive"</span></code> keyword. In this case, |
| the types of the partition keys are inferred from the file paths.</p> |
| <p>It is also possible to explicitly define the schema of the partition keys |
| using the <a class="reference internal" href="generated/pyarrow.dataset.partitioning.html#pyarrow.dataset.partitioning" title="pyarrow.dataset.partitioning"><code class="xref py py-func docutils literal notranslate"><span class="pre">partitioning()</span></code></a> function. For example:</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">part</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">partitioning</span><span class="p">(</span> |
| <span class="n">pa</span><span class="o">.</span><span class="n">schema</span><span class="p">([(</span><span class="s2">"year"</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int16</span><span class="p">()),</span> <span class="p">(</span><span class="s2">"month"</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int8</span><span class="p">()),</span> <span class="p">(</span><span class="s2">"day"</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int32</span><span class="p">())]),</span> |
| <span class="n">flavor</span><span class="o">=</span><span class="s2">"hive"</span> |
| <span class="p">)</span> |
| <span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="o">...</span><span class="p">,</span> <span class="n">partitioning</span><span class="o">=</span><span class="n">part</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <p>“Directory partitioning” is also supported, where the segments in the file path |
| represent the values of the partition keys without including the name (the |
| field names are implicit in the segment’s index). For example, given field names |
| “year”, “month”, and “day”, one path might be “/2019/11/15”.</p> |
| <p>Since the names are not included in the file paths, these must be specified |
| when constructing a directory partitioning:</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">part</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">partitioning</span><span class="p">(</span><span class="n">field_names</span><span class="o">=</span><span class="p">[</span><span class="s2">"year"</span><span class="p">,</span> <span class="s2">"month"</span><span class="p">,</span> <span class="s2">"day"</span><span class="p">])</span> |
| </pre></div> |
| </div> |
| <p>Directory partitioning also supports providing a full schema rather than inferring |
| types from file paths.</p> |
| </div> |
| </div> |
| <div class="section" id="reading-from-cloud-storage"> |
| <h2>Reading from cloud storage<a class="headerlink" href="#reading-from-cloud-storage" title="Permalink to this headline">¶</a></h2> |
| <p>In addition to local files, pyarrow also supports reading from cloud storage. |
| Currently, <a class="reference internal" href="generated/pyarrow.fs.HadoopFileSystem.html#pyarrow.fs.HadoopFileSystem" title="pyarrow.fs.HadoopFileSystem"><code class="xref py py-class docutils literal notranslate"><span class="pre">HDFS</span></code></a> and |
| <code class="xref py py-class docutils literal notranslate"><span class="pre">Amazon</span> <span class="pre">S3-compatible</span> <span class="pre">storage</span></code> are supported.</p> |
| <p>When passing a file URI, the file system will be inferred. For example, |
| specifying an S3 path:</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="s2">"s3://ursa-labs-taxi-data/"</span><span class="p">,</span> <span class="n">partitioning</span><span class="o">=</span><span class="p">[</span><span class="s2">"year"</span><span class="p">,</span> <span class="s2">"month"</span><span class="p">])</span> |
| </pre></div> |
| </div> |
| <p>Typically, you will want to customize the connection parameters, and then |
| a file system object can be created and passed to the <code class="docutils literal notranslate"><span class="pre">filesystem</span></code> keyword:</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyarrow</span> <span class="kn">import</span> <span class="n">fs</span> |
| |
| <span class="n">s3</span> <span class="o">=</span> <span class="n">fs</span><span class="o">.</span><span class="n">S3FileSystem</span><span class="p">(</span><span class="n">region</span><span class="o">=</span><span class="s2">"us-east-2"</span><span class="p">)</span> |
| <span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="s2">"ursa-labs-taxi-data/"</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="n">s3</span><span class="p">,</span> |
| <span class="n">partitioning</span><span class="o">=</span><span class="p">[</span><span class="s2">"year"</span><span class="p">,</span> <span class="s2">"month"</span><span class="p">])</span> |
| </pre></div> |
| </div> |
| <p>The currently available classes are <code class="xref py py-class docutils literal notranslate"><span class="pre">S3FileSystem</span></code> and |
| <a class="reference internal" href="generated/pyarrow.fs.HadoopFileSystem.html#pyarrow.fs.HadoopFileSystem" title="pyarrow.fs.HadoopFileSystem"><code class="xref py py-class docutils literal notranslate"><span class="pre">HadoopFileSystem</span></code></a>. See the <a class="reference internal" href="filesystems.html#filesystem"><span class="std std-ref">Filesystem Interface</span></a> docs for more |
| details.</p> |
| </div> |
| <div class="section" id="reading-from-minio"> |
| <h2>Reading from Minio<a class="headerlink" href="#reading-from-minio" title="Permalink to this headline">¶</a></h2> |
| <p>In addition to cloud storage, pyarrow also supports reading from a |
| <a class="reference external" href="https://github.com/minio/minio">MinIO</a> object storage instance emulating S3 |
| APIs. Paired with <a class="reference external" href="https://github.com/shopify/toxiproxy">toxiproxy</a>, this is |
| useful for testing or benchmarking.</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyarrow</span> <span class="kn">import</span> <span class="n">fs</span> |
| |
| <span class="c1"># By default, MinIO will listen for unencrypted HTTP traffic.</span> |
| <span class="n">minio</span> <span class="o">=</span> <span class="n">fs</span><span class="o">.</span><span class="n">S3FileSystem</span><span class="p">(</span><span class="n">scheme</span><span class="o">=</span><span class="s2">"http"</span><span class="p">,</span> <span class="n">endpoint</span><span class="o">=</span><span class="s2">"localhost:9000"</span><span class="p">)</span> |
| <span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="s2">"ursa-labs-taxi-data/"</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="n">minio</span><span class="p">,</span> |
| <span class="n">partitioning</span><span class="o">=</span><span class="p">[</span><span class="s2">"year"</span><span class="p">,</span> <span class="s2">"month"</span><span class="p">])</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="section" id="working-with-parquet-datasets"> |
| <h2>Working with Parquet Datasets<a class="headerlink" href="#working-with-parquet-datasets" title="Permalink to this headline">¶</a></h2> |
| <p>While the Datasets API provides a unified interface to different file formats, |
| some specific methods exist for Parquet Datasets.</p> |
| <p>Some processing frameworks such as Dask (optionally) use a <code class="docutils literal notranslate"><span class="pre">_metadata</span></code> file |
| with partitioned datasets which includes information about the schema and the |
| row group metadata of the full dataset. Using such a file can make creating a |
| Parquet Dataset more efficient, since it does not need to infer the |
| schema and crawl the directories for all Parquet files (this is especially the |
| case for filesystems where accessing files is expensive). The |
| <a class="reference internal" href="generated/pyarrow.dataset.parquet_dataset.html#pyarrow.dataset.parquet_dataset" title="pyarrow.dataset.parquet_dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">parquet_dataset()</span></code></a> function allows creating a Dataset from a partitioned |
| dataset with a <code class="docutils literal notranslate"><span class="pre">_metadata</span></code> file:</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">parquet_dataset</span><span class="p">(</span><span class="s2">"/path/to/dir/_metadata"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <p>By default, the constructed <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset" title="pyarrow.dataset.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> object for Parquet datasets maps |
| each fragment to a single Parquet file. If you want fragments mapping to each |
| row group of a Parquet file, you can use the <code class="docutils literal notranslate"><span class="pre">split_by_row_group()</span></code> method of |
| the fragments:</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">fragments</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">dataset</span><span class="o">.</span><span class="n">get_fragments</span><span class="p">())</span> |
| <span class="n">fragments</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">split_by_row_group</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| <p>This method returns a list of new Fragments mapping to each row group of |
| the original Fragment (Parquet file). Both <code class="docutils literal notranslate"><span class="pre">get_fragments()</span></code> and |
| <code class="docutils literal notranslate"><span class="pre">split_by_row_group()</span></code> accept an optional filter expression to get a |
| filtered list of fragments.</p> |
| </div> |
| <div class="section" id="manual-specification-of-the-dataset"> |
| <h2>Manual specification of the Dataset<a class="headerlink" href="#manual-specification-of-the-dataset" title="Permalink to this headline">¶</a></h2> |
| <p>The <a class="reference internal" href="generated/pyarrow.dataset.dataset.html#pyarrow.dataset.dataset" title="pyarrow.dataset.dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">dataset()</span></code></a> function allows easy creation of a Dataset viewing a directory, |
| crawling all subdirectories for files and partitioning information. However, |
| sometimes discovery is not required and the dataset’s files and partitions |
| are already known (for example, when this information is stored in metadata). |
| In this case it is possible to create a Dataset explicitly without any |
| automatic discovery or inference.</p> |
| <p>For the example here, we are going to use a dataset where the file names contain |
| additional partitioning information:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="go"># creating a dummy dataset: directory with two files</span> |
| <span class="gp">In [34]: </span><span class="n">table</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">table</span><span class="p">({</span><span class="s1">'col1'</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">3</span><span class="p">),</span> <span class="s1">'col2'</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">3</span><span class="p">)})</span> |
| |
| <span class="gp">In [35]: </span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset_manual"</span><span class="p">)</span><span class="o">.</span><span class="n">mkdir</span><span class="p">(</span><span class="n">exist_ok</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span> |
| |
| <span class="gp">In [36]: </span><span class="n">pq</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset_manual"</span> <span class="o">/</span> <span class="s2">"data_2018.parquet"</span><span class="p">)</span> |
| |
| <span class="gp">In [37]: </span><span class="n">pq</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset_manual"</span> <span class="o">/</span> <span class="s2">"data_2019.parquet"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <p>To create a Dataset from a list of files, we need to specify the paths, schema, |
| format, filesystem, and partition expressions manually:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [38]: </span><span class="kn">from</span> <span class="nn">pyarrow</span> <span class="kn">import</span> <span class="n">fs</span> |
| |
| <span class="gp">In [39]: </span><span class="n">schema</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">schema</span><span class="p">([(</span><span class="s2">"year"</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int64</span><span class="p">()),</span> <span class="p">(</span><span class="s2">"col1"</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int64</span><span class="p">()),</span> <span class="p">(</span><span class="s2">"col2"</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">float64</span><span class="p">())])</span> |
| |
| <span class="gp">In [40]: </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">FileSystemDataset</span><span class="o">.</span><span class="n">from_paths</span><span class="p">(</span> |
| <span class="gp"> ....: </span> <span class="p">[</span><span class="s2">"data_2018.parquet"</span><span class="p">,</span> <span class="s2">"data_2019.parquet"</span><span class="p">],</span> <span class="n">schema</span><span class="o">=</span><span class="n">schema</span><span class="p">,</span> <span class="n">format</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">ParquetFileFormat</span><span class="p">(),</span> |
| <span class="gp"> ....: </span> <span class="n">filesystem</span><span class="o">=</span><span class="n">fs</span><span class="o">.</span><span class="n">SubTreeFileSystem</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">"parquet_dataset_manual"</span><span class="p">),</span> <span class="n">fs</span><span class="o">.</span><span class="n">LocalFileSystem</span><span class="p">()),</span> |
| <span class="gp"> ....: </span> <span class="n">partitions</span><span class="o">=</span><span class="p">[</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">'year'</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2018</span><span class="p">,</span> <span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">'year'</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2019</span><span class="p">])</span> |
| <span class="gp"> ....: </span> |
| </pre></div> |
| </div> |
| <p>Since we specified the “partition expressions” for our files, this information |
| is materialized as columns when reading the data and can be used for filtering:</p> |
| <div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [41]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">()</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span> |
| <span class="gh">Out[41]: </span><span class="go"></span> |
| <span class="go"> year col1 col2</span> |
| <span class="go">0 2018 0 -0.093183</span> |
| <span class="go">1 2018 1 0.509221</span> |
| <span class="go">2 2018 2 0.072163</span> |
| <span class="go">3 2019 0 -0.093183</span> |
| <span class="go">4 2019 1 0.509221</span> |
| <span class="go">5 2019 2 0.072163</span> |
| |
| <span class="gp">In [42]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="nb">filter</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">'year'</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2019</span><span class="p">)</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span> |
| <span class="gh">Out[42]: </span><span class="go"></span> |
| <span class="go"> year col1 col2</span> |
| <span class="go">0 2019 0 -0.093183</span> |
| <span class="go">1 2019 1 0.509221</span> |
| <span class="go">2 2019 2 0.072163</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="section" id="manual-scheduling"> |
| <h2>Manual scheduling<a class="headerlink" href="#manual-scheduling" title="Permalink to this headline">¶</a></h2> |
| <p>The <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.to_table" title="pyarrow.dataset.Dataset.to_table"><code class="xref py py-func docutils literal notranslate"><span class="pre">to_table()</span></code></a> method loads all selected data into memory |
| at once resulting in a pyarrow Table. Alternatively, a dataset can also be |
| scanned one RecordBatch at a time in an iterative manner using the |
| <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.scan" title="pyarrow.dataset.Dataset.scan"><code class="xref py py-func docutils literal notranslate"><span class="pre">scan()</span></code></a> method:</p> |
| <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">for</span> <span class="n">scan_task</span> <span class="ow">in</span> <span class="n">dataset</span><span class="o">.</span><span class="n">scan</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="o">...</span><span class="p">],</span> <span class="nb">filter</span><span class="o">=...</span><span class="p">):</span> |
| <span class="k">for</span> <span class="n">record_batch</span> <span class="ow">in</span> <span class="n">scan_task</span><span class="o">.</span><span class="n">execute</span><span class="p">():</span> |
| <span class="c1"># process the record batch</span> |
| </pre></div> |
| </div> |
| </div> |
| </div> |
| |
| |
| </div> |
| |
| </div> |
| <footer> |
| |
| <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation"> |
| |
| <a href="cuda.html" class="btn btn-neutral float-right" title="CUDA Integration" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a> |
| |
| |
| <a href="parquet.html" class="btn btn-neutral float-left" title="Reading and Writing the Apache Parquet Format" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a> |
| |
| </div> |
| |
| |
| <hr/> |
| |
| <div role="contentinfo"> |
| <p> |
| |
| © Copyright 2016-2019 Apache Software Foundation |
| |
| </p> |
| </div> |
| |
| |
| |
| Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a |
| |
| <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> |
| |
| provided by <a href="https://readthedocs.org">Read the Docs</a>. |
| |
| </footer> |
| |
| </div> |
| </div> |
| |
| </section> |
| |
| </div> |
| |
| |
| <script type="text/javascript"> |
| jQuery(function () { |
| SphinxRtdTheme.Navigation.enable(true); |
| }); |
| </script> |
| |
| |
| |
| |
| |
| |
| |
| <script type="text/javascript" src="/docs/_static/versionwarning.js"></script></body> |
| </html> |