blob: 2fce434855e1d1b808223ae58dd4ea49974a113f [file]
<!DOCTYPE html>
<html class="writer-html5" lang="en" >
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Tabular Datasets &mdash; Apache Arrow v2.0.0</title>
<link rel="stylesheet" href="../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
<!--[if lt IE 9]>
<script src="../_static/js/html5shiv.min.js"></script>
<![endif]-->
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
<script src="../_static/jquery.js"></script>
<script src="../_static/underscore.js"></script>
<script src="../_static/doctools.js"></script>
<script src="../_static/language_data.js"></script>
<script type="text/javascript" src="../_static/js/theme.js"></script>
<link rel="canonical" href="https://arrow.apache.org/docs/python/dataset.html" />
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="CUDA Integration" href="cuda.html" />
<link rel="prev" title="Reading and Writing the Apache Parquet Format" href="parquet.html" />
<!-- Matomo -->
<script>
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["setDoNotTrack", true]);
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '20']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../index.html" class="icon icon-home" alt="Documentation Home"> Apache Arrow
</a>
<div class="version">
2.0.0
</div>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
<p class="caption"><span class="caption-text">Specifications and Protocols</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../format/Versioning.html">Format Versioning and Stability</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/Columnar.html">Arrow Columnar Format</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/Flight.html">Arrow Flight RPC</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/Integration.html">Integration Testing</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/CDataInterface.html">The Arrow C data interface</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/CStreamInterface.html">The Arrow C stream interface</a></li>
<li class="toctree-l1"><a class="reference internal" href="../format/Other.html">Other Data Structures</a></li>
</ul>
<p class="caption"><span class="caption-text">Libraries</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../status.html">Implementation Status</a></li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/c_glib/">C/GLib</a></li>
<li class="toctree-l1"><a class="reference internal" href="../cpp/index.html">C++</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/csharp/README.md">C#</a></li>
<li class="toctree-l1"><a class="reference external" href="https://godoc.org/github.com/apache/arrow/go/arrow">Go</a></li>
<li class="toctree-l1"><a class="reference internal" href="../java/index.html">Java</a></li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/js/">JavaScript</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/matlab/README.md">MATLAB</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="index.html">Python</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="install.html">Installing PyArrow</a></li>
<li class="toctree-l2"><a class="reference internal" href="memory.html">Memory and IO Interfaces</a></li>
<li class="toctree-l2"><a class="reference internal" href="data.html">Data Types and In-Memory Data Model</a></li>
<li class="toctree-l2"><a class="reference internal" href="compute.html">Compute Functions</a></li>
<li class="toctree-l2"><a class="reference internal" href="ipc.html">Streaming, Serialization, and IPC</a></li>
<li class="toctree-l2"><a class="reference internal" href="filesystems.html">Filesystem Interface</a></li>
<li class="toctree-l2"><a class="reference internal" href="filesystems_deprecated.html">Filesystem Interface (legacy)</a></li>
<li class="toctree-l2"><a class="reference internal" href="plasma.html">The Plasma In-Memory Object Store</a></li>
<li class="toctree-l2"><a class="reference internal" href="numpy.html">NumPy Integration</a></li>
<li class="toctree-l2"><a class="reference internal" href="pandas.html">Pandas Integration</a></li>
<li class="toctree-l2"><a class="reference internal" href="timestamps.html">Timestamps</a></li>
<li class="toctree-l2"><a class="reference internal" href="csv.html">Reading CSV files</a></li>
<li class="toctree-l2"><a class="reference internal" href="feather.html">Feather File Format</a></li>
<li class="toctree-l2"><a class="reference internal" href="json.html">Reading JSON files</a></li>
<li class="toctree-l2"><a class="reference internal" href="parquet.html">Reading and Writing the Apache Parquet Format</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">Tabular Datasets</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#reading-datasets">Reading Datasets</a><ul>
<li class="toctree-l4"><a class="reference internal" href="#dataset-discovery">Dataset discovery</a></li>
<li class="toctree-l4"><a class="reference internal" href="#reading-different-file-formats">Reading different file formats</a></li>
<li class="toctree-l4"><a class="reference internal" href="#customizing-file-formats">Customizing file formats</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="#filtering-data">Filtering data</a></li>
<li class="toctree-l3"><a class="reference internal" href="#reading-partitioned-data">Reading partitioned data</a><ul>
<li class="toctree-l4"><a class="reference internal" href="#different-partitioning-schemes">Different partitioning schemes</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="#reading-from-cloud-storage">Reading from cloud storage</a></li>
<li class="toctree-l3"><a class="reference internal" href="#reading-from-minio">Reading from Minio</a></li>
<li class="toctree-l3"><a class="reference internal" href="#working-with-parquet-datasets">Working with Parquet Datasets</a></li>
<li class="toctree-l3"><a class="reference internal" href="#manual-specification-of-the-dataset">Manual specification of the Dataset</a></li>
<li class="toctree-l3"><a class="reference internal" href="#manual-scheduling">Manual scheduling</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="cuda.html">CUDA Integration</a></li>
<li class="toctree-l2"><a class="reference internal" href="extending_types.html">Extending pyarrow</a></li>
<li class="toctree-l2"><a class="reference internal" href="extending.html">Using pyarrow from C++ and Cython Code</a></li>
<li class="toctree-l2"><a class="reference internal" href="api.html">API Reference</a></li>
<li class="toctree-l2"><a class="reference internal" href="getting_involved.html">Getting Involved</a></li>
<li class="toctree-l2"><a class="reference internal" href="benchmarks.html">Benchmarks</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/r/">R</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/ruby/README.md">Ruby</a></li>
<li class="toctree-l1"><a class="reference external" href="https://docs.rs/crate/arrow/">Rust</a></li>
</ul>
<p class="caption"><span class="caption-text">Development</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../developers/contributing.html">Contributing to Apache Arrow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/cpp/index.html">C++ Development</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/python.html">Python Development</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/archery.html">Daily Development using Archery</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/crossbow.html">Packaging and Testing with Crossbow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/docker.html">Running Docker Builds</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/benchmarks.html">Benchmarks</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developers/documentation.html">Building the Documentation</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" aria-label="top navigation">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../index.html">Apache Arrow</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<li><a href="../index.html" class="icon icon-home"></a> &raquo;</li>
<li><a href="index.html">Python bindings</a> &raquo;</li>
<li>Tabular Datasets</li>
<li class="wy-breadcrumbs-aside">
<a href="../_sources/python/dataset.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<div class="section" id="tabular-datasets">
<span id="dataset"></span><h1>Tabular Datasets<a class="headerlink" href="#tabular-datasets" title="Permalink to this headline"></a></h1>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>The <code class="docutils literal notranslate"><span class="pre">pyarrow.dataset</span></code> module is experimental (specifically the classes),
and a stable API is not yet guaranteed.</p>
</div>
<p>The <code class="docutils literal notranslate"><span class="pre">pyarrow.dataset</span></code> module provides functionality to efficiently work with
tabular, potentially larger than memory and multi-file datasets:</p>
<ul class="simple">
<li><p>A unified interface for different sources: supporting different sources and
file formats (Parquet, Feather files) and different file systems (local,
cloud).</p></li>
<li><p>Discovery of sources (crawling directories, handle directory-based partitioned
datasets, basic schema normalization, ..)</p></li>
<li><p>Optimized reading with predicate pushdown (filtering rows), projection
(selecting columns), parallel reading or fine-grained managing of tasks.</p></li>
</ul>
<p>Currently, only Parquet and Feather / Arrow IPC files are supported. The goal
is to expand this in the future to other file formats and data sources (e.g.
database connections).</p>
<p>For those familiar with the existing <a class="reference internal" href="generated/pyarrow.parquet.ParquetDataset.html#pyarrow.parquet.ParquetDataset" title="pyarrow.parquet.ParquetDataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyarrow.parquet.ParquetDataset</span></code></a> for
reading Parquet datasets: <code class="docutils literal notranslate"><span class="pre">pyarrow.dataset</span></code>’s goal is similar but not specific
to the Parquet format and not tied to Python: the same datasets API is exposed
in the R bindings or Arrow. In addition <code class="docutils literal notranslate"><span class="pre">pyarrow.dataset</span></code> boasts improved
perfomance and new features (e.g. filtering within files rather than only on
partition keys).</p>
<div class="section" id="reading-datasets">
<h2>Reading Datasets<a class="headerlink" href="#reading-datasets" title="Permalink to this headline"></a></h2>
<p>For the examples below, let’s create a small dataset consisting
of a directory with two parquet files:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [1]: </span><span class="kn">import</span> <span class="nn">tempfile</span>
<span class="gp">In [2]: </span><span class="kn">import</span> <span class="nn">pathlib</span>
<span class="gp">In [3]: </span><span class="kn">import</span> <span class="nn">pyarrow</span> <span class="kn">as</span> <span class="nn">pa</span>
<span class="gp">In [4]: </span><span class="kn">import</span> <span class="nn">pyarrow.parquet</span> <span class="kn">as</span> <span class="nn">pq</span>
<span class="gp">In [5]: </span><span class="n">base</span> <span class="o">=</span> <span class="n">pathlib</span><span class="o">.</span><span class="n">Path</span><span class="p">(</span><span class="n">tempfile</span><span class="o">.</span><span class="n">gettempdir</span><span class="p">())</span>
<span class="gp">In [6]: </span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">mkdir</span><span class="p">(</span><span class="n">exist_ok</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
<span class="go"># creating an Arrow Table</span>
<span class="gp">In [7]: </span><span class="n">table</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">table</span><span class="p">({</span><span class="s1">&#39;a&#39;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span> <span class="s1">&#39;b&#39;</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span> <span class="s1">&#39;c&#39;</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">]</span> <span class="o">*</span> <span class="mi">5</span><span class="p">})</span>
<span class="go"># writing it into two parquet files</span>
<span class="gp">In [8]: </span><span class="n">pq</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="o">.</span><span class="n">slice</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">5</span><span class="p">),</span> <span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset/data1.parquet&quot;</span><span class="p">)</span>
<span class="gp">In [9]: </span><span class="n">pq</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="o">.</span><span class="n">slice</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">10</span><span class="p">),</span> <span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset/data2.parquet&quot;</span><span class="p">)</span>
</pre></div>
</div>
<div class="section" id="dataset-discovery">
<h3>Dataset discovery<a class="headerlink" href="#dataset-discovery" title="Permalink to this headline"></a></h3>
<p>A <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset" title="pyarrow.dataset.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> object can be created with the <a class="reference internal" href="generated/pyarrow.dataset.dataset.html#pyarrow.dataset.dataset" title="pyarrow.dataset.dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">dataset()</span></code></a> function. We
can pass it the path to the directory containing the data files:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [10]: </span><span class="kn">import</span> <span class="nn">pyarrow.dataset</span> <span class="kn">as</span> <span class="nn">ds</span>
<span class="gp">In [11]: </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset&quot;</span><span class="p">,</span> <span class="n">format</span><span class="o">=</span><span class="s2">&quot;parquet&quot;</span><span class="p">)</span>
<span class="gp">In [12]: </span><span class="n">dataset</span>
<span class="gh">Out[12]: </span><span class="go">&lt;pyarrow._dataset.FileSystemDataset at 0x7fea83e1f8f0&gt;</span>
</pre></div>
</div>
<p>In addition to a base directory path, <a class="reference internal" href="generated/pyarrow.dataset.dataset.html#pyarrow.dataset.dataset" title="pyarrow.dataset.dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">dataset()</span></code></a> accepts a path to a single
file or a list of file paths.</p>
<p>Creating a <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset" title="pyarrow.dataset.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> object loads nothing into memory, it only crawls the
directory to find all the files:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [13]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">files</span>
<span class="gh">Out[13]: </span><span class="go">[&#39;/tmp/parquet_dataset/data1.parquet&#39;, &#39;/tmp/parquet_dataset/data2.parquet&#39;]</span>
</pre></div>
</div>
<p>… and infers the dataset’s schema (by default from the first file):</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [14]: </span><span class="k">print</span><span class="p">(</span><span class="n">dataset</span><span class="o">.</span><span class="n">schema</span><span class="o">.</span><span class="n">to_string</span><span class="p">(</span><span class="n">show_field_metadata</span><span class="o">=</span><span class="bp">False</span><span class="p">))</span>
<span class="go">a: int64</span>
<span class="go">b: double</span>
<span class="go">c: int64</span>
</pre></div>
</div>
<p>Using the <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.to_table" title="pyarrow.dataset.Dataset.to_table"><code class="xref py py-meth docutils literal notranslate"><span class="pre">Dataset.to_table()</span></code></a> method we can read the dataset (or a portion
of it) into a pyarrow Table (note that depending on the size of your dataset
this can require a lot of memory, see below on filtering / iterative loading):</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [15]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">()</span>
<span class="gh">Out[15]: </span><span class="go"></span>
<span class="go">pyarrow.Table</span>
<span class="go">a: int64</span>
<span class="go">b: double</span>
<span class="go">c: int64</span>
<span class="go"># converting to pandas to see the contents of the scanned table</span>
<span class="gp">In [16]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">()</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span>
<span class="gh">Out[16]: </span><span class="go"></span>
<span class="go"> a b c</span>
<span class="go">0 0 1.537236 1</span>
<span class="go">1 1 -0.165335 2</span>
<span class="go">2 2 -0.200643 1</span>
<span class="go">3 3 0.606877 2</span>
<span class="go">4 4 -0.788530 1</span>
<span class="go">5 5 0.872821 2</span>
<span class="go">6 6 0.263579 1</span>
<span class="go">7 7 -1.392005 2</span>
<span class="go">8 8 0.346150 1</span>
<span class="go">9 9 -0.414411 2</span>
</pre></div>
</div>
</div>
<div class="section" id="reading-different-file-formats">
<h3>Reading different file formats<a class="headerlink" href="#reading-different-file-formats" title="Permalink to this headline"></a></h3>
<p>The above examples use Parquet files as dataset source but the Dataset API
provides a consistent interface across multiple file formats and sources.
Currently, Parquet and Feather / Arrow IPC file format are supported; more
formats are planned in the future.</p>
<p>If we save the table as a Feather file instead of Parquet files:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [17]: </span><span class="kn">import</span> <span class="nn">pyarrow.feather</span> <span class="kn">as</span> <span class="nn">feather</span>
<span class="gp">In [18]: </span><span class="n">feather</span><span class="o">.</span><span class="n">write_feather</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;data.feather&quot;</span><span class="p">)</span>
</pre></div>
</div>
<p>then we can read the Feather file using the same functions, but with specifying
<code class="docutils literal notranslate"><span class="pre">format=&quot;feather&quot;</span></code>:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [19]: </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;data.feather&quot;</span><span class="p">,</span> <span class="n">format</span><span class="o">=</span><span class="s2">&quot;feather&quot;</span><span class="p">)</span>
<span class="gp">In [20]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">()</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[20]: </span><span class="go"></span>
<span class="go"> a b c</span>
<span class="go">0 0 1.537236 1</span>
<span class="go">1 1 -0.165335 2</span>
<span class="go">2 2 -0.200643 1</span>
<span class="go">3 3 0.606877 2</span>
<span class="go">4 4 -0.788530 1</span>
</pre></div>
</div>
</div>
<div class="section" id="customizing-file-formats">
<h3>Customizing file formats<a class="headerlink" href="#customizing-file-formats" title="Permalink to this headline"></a></h3>
<p>The format name as a string, like:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="o">...</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s2">&quot;parquet&quot;</span><span class="p">)</span>
</pre></div>
</div>
<p>is short hand for a default constructed <a class="reference internal" href="generated/pyarrow.dataset.ParquetFileFormat.html#pyarrow.dataset.ParquetFileFormat" title="pyarrow.dataset.ParquetFileFormat"><code class="xref py py-class docutils literal notranslate"><span class="pre">ParquetFileFormat</span></code></a>:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="o">...</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">ParquetFileForma</span><span class="p">())</span>
</pre></div>
</div>
<p>The <a class="reference internal" href="generated/pyarrow.dataset.FileFormat.html#pyarrow.dataset.FileFormat" title="pyarrow.dataset.FileFormat"><code class="xref py py-class docutils literal notranslate"><span class="pre">FileFormat</span></code></a> objects can be customized using keywords. For example:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">parquet_format</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">ParquetFileFormat</span><span class="p">(</span><span class="n">read_options</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;dictionary_columns&#39;</span><span class="p">:</span> <span class="p">[</span><span class="s1">&#39;a&#39;</span><span class="p">]})</span>
<span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="o">...</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="n">parquet_format</span><span class="p">)</span>
</pre></div>
</div>
<p>Will configure column <code class="docutils literal notranslate"><span class="pre">&quot;a&quot;</span></code> to be dictionary encoded on scan.</p>
</div>
</div>
<div class="section" id="filtering-data">
<h2>Filtering data<a class="headerlink" href="#filtering-data" title="Permalink to this headline"></a></h2>
<p>To avoid reading all data when only needing a subset, the <code class="docutils literal notranslate"><span class="pre">columns</span></code> and
<code class="docutils literal notranslate"><span class="pre">filter</span></code> keywords can be used.</p>
<p>The <code class="docutils literal notranslate"><span class="pre">columns</span></code> keyword can be used to only read the specified columns:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [21]: </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset&quot;</span><span class="p">,</span> <span class="n">format</span><span class="o">=</span><span class="s2">&quot;parquet&quot;</span><span class="p">)</span>
<span class="gp">In [22]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;a&#39;</span><span class="p">,</span> <span class="s1">&#39;b&#39;</span><span class="p">])</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span>
<span class="gh">Out[22]: </span><span class="go"></span>
<span class="go"> a b</span>
<span class="go">0 0 1.537236</span>
<span class="go">1 1 -0.165335</span>
<span class="go">2 2 -0.200643</span>
<span class="go">3 3 0.606877</span>
<span class="go">4 4 -0.788530</span>
<span class="go">5 5 0.872821</span>
<span class="go">6 6 0.263579</span>
<span class="go">7 7 -1.392005</span>
<span class="go">8 8 0.346150</span>
<span class="go">9 9 -0.414411</span>
</pre></div>
</div>
<p>With the <code class="docutils literal notranslate"><span class="pre">filter</span></code> keyword, rows which do not match the filter predicate will
not be included in the returned table. The keyword expects a boolean
<a class="reference internal" href="generated/pyarrow.dataset.Expression.html#pyarrow.dataset.Expression" title="pyarrow.dataset.Expression"><code class="xref py py-class docutils literal notranslate"><span class="pre">Expression</span></code></a> referencing at least one of the columns:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [23]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="nb">filter</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">&#39;a&#39;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="mi">7</span><span class="p">)</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span>
<span class="gh">Out[23]: </span><span class="go"></span>
<span class="go"> a b c</span>
<span class="go">0 7 -1.392005 2</span>
<span class="go">1 8 0.346150 1</span>
<span class="go">2 9 -0.414411 2</span>
<span class="gp">In [24]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="nb">filter</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">&#39;c&#39;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span>
<span class="gh">Out[24]: </span><span class="go"></span>
<span class="go"> a b c</span>
<span class="go">0 1 -0.165335 2</span>
<span class="go">1 3 0.606877 2</span>
<span class="go">2 5 0.872821 2</span>
<span class="go">3 7 -1.392005 2</span>
<span class="go">4 9 -0.414411 2</span>
</pre></div>
</div>
<p>The easiest way to construct those <a class="reference internal" href="generated/pyarrow.dataset.Expression.html#pyarrow.dataset.Expression" title="pyarrow.dataset.Expression"><code class="xref py py-class docutils literal notranslate"><span class="pre">Expression</span></code></a> objects is by using the
<a class="reference internal" href="generated/pyarrow.dataset.field.html#pyarrow.dataset.field" title="pyarrow.dataset.field"><code class="xref py py-func docutils literal notranslate"><span class="pre">field()</span></code></a> helper function. Any column - not just partition columns - can be
referenced using the <a class="reference internal" href="generated/pyarrow.dataset.field.html#pyarrow.dataset.field" title="pyarrow.dataset.field"><code class="xref py py-func docutils literal notranslate"><span class="pre">field()</span></code></a> function (which creates a
<code class="xref py py-class docutils literal notranslate"><span class="pre">FieldExpression</span></code>). Operator overloads are provided to compose filters
including the comparisons (equal, larger/less than, etc), set membership
testing, and boolean combinations (and, or, not):</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [25]: </span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">&#39;a&#39;</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">3</span>
<span class="gh">Out[25]: </span><span class="go">&lt;pyarrow.dataset.Expression (a != 3:int64)&gt;</span>
<span class="gp">In [26]: </span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">&#39;a&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">isin</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">])</span>
<span class="gh">Out[26]: </span><span class="go"></span>
<span class="go">&lt;pyarrow.dataset.Expression (a is in [</span>
<span class="go"> 1,</span>
<span class="go"> 2,</span>
<span class="go"> 3</span>
<span class="go">])&gt;</span>
<span class="gp">In [27]: </span><span class="p">(</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">&#39;a&#39;</span><span class="p">)</span> <span class="o">&gt;</span> <span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">&#39;b&#39;</span><span class="p">))</span> <span class="o">&amp;</span> <span class="p">(</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">&#39;b&#39;</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">)</span>
<span class="gh">Out[27]: </span><span class="go">&lt;pyarrow.dataset.Expression ((a &gt; b) and (b &gt; 1:int64))&gt;</span>
</pre></div>
</div>
</div>
<div class="section" id="reading-partitioned-data">
<h2>Reading partitioned data<a class="headerlink" href="#reading-partitioned-data" title="Permalink to this headline"></a></h2>
<p>Above, a dataset consisting of a flat directory with files was shown. However, a
dataset can exploit a nested directory structure defining a partitioned dataset,
where the sub-directory names hold information about which subset of the data is
stored in that directory.</p>
<p>For example, a dataset partitioned by year and month may look like on disk:</p>
<div class="highlight-text notranslate"><div class="highlight"><pre><span></span>dataset_name/
year=2007/
month=01/
data0.parquet
data1.parquet
...
month=02/
data0.parquet
data1.parquet
...
month=03/
...
year=2008/
month=01/
...
...
</pre></div>
</div>
<p>The above partitioning scheme is using “/key=value/” directory names, as found
in Apache Hive.</p>
<p>Let’s create a small partitioned dataset. The <a class="reference internal" href="generated/pyarrow.parquet.write_to_dataset.html#pyarrow.parquet.write_to_dataset" title="pyarrow.parquet.write_to_dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">write_to_dataset()</span></code></a>
function can write such hive-like partitioned datasets.</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [28]: </span><span class="n">table</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">table</span><span class="p">({</span><span class="s1">&#39;a&#39;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span> <span class="s1">&#39;b&#39;</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">10</span><span class="p">),</span> <span class="s1">&#39;c&#39;</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">]</span> <span class="o">*</span> <span class="mi">5</span><span class="p">,</span>
<span class="gp"> ....: </span> <span class="s1">&#39;part&#39;</span><span class="p">:</span> <span class="p">[</span><span class="s1">&#39;a&#39;</span><span class="p">]</span> <span class="o">*</span> <span class="mi">5</span> <span class="o">+</span> <span class="p">[</span><span class="s1">&#39;b&#39;</span><span class="p">]</span> <span class="o">*</span> <span class="mi">5</span><span class="p">})</span>
<span class="gp"> ....: </span>
<span class="gp">In [29]: </span><span class="n">pq</span><span class="o">.</span><span class="n">write_to_dataset</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="nb">str</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset_partitioned&quot;</span><span class="p">),</span>
<span class="gp"> ....: </span> <span class="n">partition_cols</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;part&#39;</span><span class="p">])</span>
<span class="gp"> ....: </span>
</pre></div>
</div>
<p>The above created a directory with two subdirectories (“part=a” and “part=b”),
and the Parquet files written in those directories no longer include the “part”
column.</p>
<p>Reading this dataset with <a class="reference internal" href="generated/pyarrow.dataset.dataset.html#pyarrow.dataset.dataset" title="pyarrow.dataset.dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">dataset()</span></code></a>, we now specify that the dataset
uses a hive-like partitioning scheme with the <cite>partitioning</cite> keyword:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [30]: </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset_partitioned&quot;</span><span class="p">),</span> <span class="n">format</span><span class="o">=</span><span class="s2">&quot;parquet&quot;</span><span class="p">,</span>
<span class="gp"> ....: </span> <span class="n">partitioning</span><span class="o">=</span><span class="s2">&quot;hive&quot;</span><span class="p">)</span>
<span class="gp"> ....: </span>
<span class="gp">In [31]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">files</span>
<span class="gh">Out[31]: </span><span class="go"></span>
<span class="go">[&#39;/tmp/parquet_dataset_partitioned/part=a/fc4d5802496645d8a712ccdf6f1e02da.parquet&#39;,</span>
<span class="go"> &#39;/tmp/parquet_dataset_partitioned/part=b/348d983e005740aeb57da4b7733ac09c.parquet&#39;]</span>
</pre></div>
</div>
<p>Although the partition fields are not included in the actual Parquet files,
they will be added back to the resulting table when scanning this dataset:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [32]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">()</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[32]: </span><span class="go"></span>
<span class="go"> a b c part</span>
<span class="go">0 0 -0.416769 1 a</span>
<span class="go">1 1 -0.597459 2 a</span>
<span class="go">2 2 0.076372 1 a</span>
</pre></div>
</div>
<p>We can now filter on the partition keys, which avoids loading files
altogether if they do not match the predicate:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [33]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="nb">filter</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s2">&quot;part&quot;</span><span class="p">)</span> <span class="o">==</span> <span class="s2">&quot;b&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span>
<span class="gh">Out[33]: </span><span class="go"></span>
<span class="go"> a b c part</span>
<span class="go">0 5 -1.236938 2 b</span>
<span class="go">1 6 0.563724 1 b</span>
<span class="go">2 7 -0.984011 2 b</span>
<span class="go">3 8 0.428099 1 b</span>
<span class="go">4 9 -0.482604 2 b</span>
</pre></div>
</div>
<div class="section" id="different-partitioning-schemes">
<h3>Different partitioning schemes<a class="headerlink" href="#different-partitioning-schemes" title="Permalink to this headline"></a></h3>
<p>The above example uses a hive-like directory scheme, such as “/year=2009/month=11/day=15”.
We specified this passing the <code class="docutils literal notranslate"><span class="pre">partitioning=&quot;hive&quot;</span></code> keyword. In this case,
the types of the partition keys are inferred from the file paths.</p>
<p>It is also possible to explicitly define the schema of the partition keys
using the <a class="reference internal" href="generated/pyarrow.dataset.partitioning.html#pyarrow.dataset.partitioning" title="pyarrow.dataset.partitioning"><code class="xref py py-func docutils literal notranslate"><span class="pre">partitioning()</span></code></a> function. For example:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">part</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">partitioning</span><span class="p">(</span>
<span class="n">pa</span><span class="o">.</span><span class="n">schema</span><span class="p">([(</span><span class="s2">&quot;year&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int16</span><span class="p">()),</span> <span class="p">(</span><span class="s2">&quot;month&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int8</span><span class="p">()),</span> <span class="p">(</span><span class="s2">&quot;day&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int32</span><span class="p">())]),</span>
<span class="n">flavor</span><span class="o">=</span><span class="s2">&quot;hive&quot;</span>
<span class="p">)</span>
<span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="o">...</span><span class="p">,</span> <span class="n">partitioning</span><span class="o">=</span><span class="n">part</span><span class="p">)</span>
</pre></div>
</div>
<p>“Directory partitioning” is also supported, where the segments in the file path
represent the values of the partition keys without including the name (the
field name are implicit in the segment’s index). For example, given field names
“year”, “month”, and “day”, one path might be “/2019/11/15”.</p>
<p>Since the names are not included in the file paths, these must be specified
when constructing a directory partitioning:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">part</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">partitioning</span><span class="p">(</span><span class="n">field_names</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;year&quot;</span><span class="p">,</span> <span class="s2">&quot;month&quot;</span><span class="p">,</span> <span class="s2">&quot;day&quot;</span><span class="p">])</span>
</pre></div>
</div>
<p>Directory partitioning also supports providing a full schema rather than inferring
types from file paths.</p>
</div>
</div>
<div class="section" id="reading-from-cloud-storage">
<h2>Reading from cloud storage<a class="headerlink" href="#reading-from-cloud-storage" title="Permalink to this headline"></a></h2>
<p>In addition to local files, pyarrow also supports reading from cloud storage.
Currently, <a class="reference internal" href="generated/pyarrow.fs.HadoopFileSystem.html#pyarrow.fs.HadoopFileSystem" title="pyarrow.fs.HadoopFileSystem"><code class="xref py py-class docutils literal notranslate"><span class="pre">HDFS</span></code></a> and
<code class="xref py py-class docutils literal notranslate"><span class="pre">Amazon</span> <span class="pre">S3-compatible</span> <span class="pre">storage</span></code> are supported.</p>
<p>When passing a file URI, the file system will be inferred. For example,
specifying a S3 path:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="s2">&quot;s3://ursa-labs-taxi-data/&quot;</span><span class="p">,</span> <span class="n">partitioning</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;year&quot;</span><span class="p">,</span> <span class="s2">&quot;month&quot;</span><span class="p">])</span>
</pre></div>
</div>
<p>Typically, you will want to customize the connection parameters, and then
a file system object can be created and passed to the <code class="docutils literal notranslate"><span class="pre">filesystem</span></code> keyword:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyarrow</span> <span class="kn">import</span> <span class="n">fs</span>
<span class="n">s3</span> <span class="o">=</span> <span class="n">fs</span><span class="o">.</span><span class="n">S3FileSystem</span><span class="p">(</span><span class="n">region</span><span class="o">=</span><span class="s2">&quot;us-east-2&quot;</span><span class="p">)</span>
<span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="s2">&quot;ursa-labs-taxi-data/&quot;</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="n">s3</span><span class="p">,</span>
<span class="n">partitioning</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;year&quot;</span><span class="p">,</span> <span class="s2">&quot;month&quot;</span><span class="p">])</span>
</pre></div>
</div>
<p>The currently available classes are <code class="xref py py-class docutils literal notranslate"><span class="pre">S3FileSystem</span></code> and
<a class="reference internal" href="generated/pyarrow.fs.HadoopFileSystem.html#pyarrow.fs.HadoopFileSystem" title="pyarrow.fs.HadoopFileSystem"><code class="xref py py-class docutils literal notranslate"><span class="pre">HadoopFileSystem</span></code></a>. See the <a class="reference internal" href="filesystems.html#filesystem"><span class="std std-ref">Filesystem Interface</span></a> docs for more
details.</p>
</div>
<div class="section" id="reading-from-minio">
<h2>Reading from Minio<a class="headerlink" href="#reading-from-minio" title="Permalink to this headline"></a></h2>
<p>In addition to cloud storage, pyarrow also supports reading from a
<a class="reference external" href="https://github.com/minio/minio">MinIO</a> object storage instance emulating S3
APIs. Paired with <a class="reference external" href="https://github.com/shopify/toxiproxy">toxiproxy</a>, this is
useful for testing or benchmarking.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyarrow</span> <span class="kn">import</span> <span class="n">fs</span>
<span class="c1"># By default, MinIO will listen for unencrypted HTTP traffic.</span>
<span class="n">minio</span> <span class="o">=</span> <span class="n">fs</span><span class="o">.</span><span class="n">S3FileSystem</span><span class="p">(</span><span class="n">scheme</span><span class="o">=</span><span class="s2">&quot;http&quot;</span><span class="p">,</span> <span class="n">endpoint</span><span class="o">=</span><span class="s2">&quot;localhost:9000&quot;</span><span class="p">)</span>
<span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="s2">&quot;ursa-labs-taxi-data/&quot;</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="n">minio</span><span class="p">,</span>
<span class="n">partitioning</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;year&quot;</span><span class="p">,</span> <span class="s2">&quot;month&quot;</span><span class="p">])</span>
</pre></div>
</div>
</div>
<div class="section" id="working-with-parquet-datasets">
<h2>Working with Parquet Datasets<a class="headerlink" href="#working-with-parquet-datasets" title="Permalink to this headline"></a></h2>
<p>While the Datasets API provides a unified interface to different file formats,
some specific methods exist for Parquet Datasets.</p>
<p>Some processing frameworks such as Dask (optionally) use a <code class="docutils literal notranslate"><span class="pre">_metadata</span></code> file
with partitioned datasets which includes information about the schema and the
row group metadata of the full dataset. Using such file can give a more
efficient creation of a parquet Dataset, since it does not need to infer the
schema and crawl the directories for all Parquet files (this is especially the
case for filesystems where accessing files is expensive). The
<a class="reference internal" href="generated/pyarrow.dataset.parquet_dataset.html#pyarrow.dataset.parquet_dataset" title="pyarrow.dataset.parquet_dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">parquet_dataset()</span></code></a> function allows to create a Dataset from a partitioned
dataset with a <code class="docutils literal notranslate"><span class="pre">_metadata</span></code> file:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">parquet_dataset</span><span class="p">(</span><span class="s2">&quot;/path/to/dir/_metadata&quot;</span><span class="p">)</span>
</pre></div>
</div>
<p>By default, the constructed <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset" title="pyarrow.dataset.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> object for Parquet datasets maps
each fragment to a single Parquet file. If you want fragments mapping to each
row group of a Parquet file, you can use the <code class="docutils literal notranslate"><span class="pre">split_by_row_group()</span></code> method of
the fragments:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">fragments</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">dataset</span><span class="o">.</span><span class="n">get_fragments</span><span class="p">())</span>
<span class="n">fragments</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">split_by_row_group</span><span class="p">()</span>
</pre></div>
</div>
<p>This method returns a list of new Fragments mapping to each row group of
the original Fragment (Parquet file). Both <code class="docutils literal notranslate"><span class="pre">get_fragments()</span></code> and
<code class="docutils literal notranslate"><span class="pre">split_by_row_group()</span></code> accept an optional filter expression to get a
filtered list of fragments.</p>
</div>
<div class="section" id="manual-specification-of-the-dataset">
<h2>Manual specification of the Dataset<a class="headerlink" href="#manual-specification-of-the-dataset" title="Permalink to this headline"></a></h2>
<p>The <a class="reference internal" href="generated/pyarrow.dataset.dataset.html#pyarrow.dataset.dataset" title="pyarrow.dataset.dataset"><code class="xref py py-func docutils literal notranslate"><span class="pre">dataset()</span></code></a> function allows easy creation of a Dataset viewing a directory,
crawling all subdirectories for files and partitioning information. However
sometimes discovery is not required and the dataset’s files and partitions
are already known (for example, when this information is stored in metadata).
In this case it is possible to create a Dataset explicitly without any
automatic discovery or inference.</p>
<p>For the example here, we are going to use a dataset where the file names contain
additional partitioning information:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="go"># creating a dummy dataset: directory with two files</span>
<span class="gp">In [34]: </span><span class="n">table</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">table</span><span class="p">({</span><span class="s1">&#39;col1&#39;</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">3</span><span class="p">),</span> <span class="s1">&#39;col2&#39;</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">3</span><span class="p">)})</span>
<span class="gp">In [35]: </span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset_manual&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">mkdir</span><span class="p">(</span><span class="n">exist_ok</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
<span class="gp">In [36]: </span><span class="n">pq</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset_manual&quot;</span> <span class="o">/</span> <span class="s2">&quot;data_2018.parquet&quot;</span><span class="p">)</span>
<span class="gp">In [37]: </span><span class="n">pq</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset_manual&quot;</span> <span class="o">/</span> <span class="s2">&quot;data_2019.parquet&quot;</span><span class="p">)</span>
</pre></div>
</div>
<p>To create a Dataset from a list of files, we need to specify the paths, schema,
format, filesystem, and partition expressions manually:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [38]: </span><span class="kn">from</span> <span class="nn">pyarrow</span> <span class="kn">import</span> <span class="n">fs</span>
<span class="gp">In [39]: </span><span class="n">schema</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">schema</span><span class="p">([(</span><span class="s2">&quot;year&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int64</span><span class="p">()),</span> <span class="p">(</span><span class="s2">&quot;col1&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">int64</span><span class="p">()),</span> <span class="p">(</span><span class="s2">&quot;col2&quot;</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">float64</span><span class="p">())])</span>
<span class="gp">In [40]: </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">FileSystemDataset</span><span class="o">.</span><span class="n">from_paths</span><span class="p">(</span>
<span class="gp"> ....: </span> <span class="p">[</span><span class="s2">&quot;data_2018.parquet&quot;</span><span class="p">,</span> <span class="s2">&quot;data_2019.parquet&quot;</span><span class="p">],</span> <span class="n">schema</span><span class="o">=</span><span class="n">schema</span><span class="p">,</span> <span class="n">format</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">ParquetFileFormat</span><span class="p">(),</span>
<span class="gp"> ....: </span> <span class="n">filesystem</span><span class="o">=</span><span class="n">fs</span><span class="o">.</span><span class="n">SubTreeFileSystem</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">base</span> <span class="o">/</span> <span class="s2">&quot;parquet_dataset_manual&quot;</span><span class="p">),</span> <span class="n">fs</span><span class="o">.</span><span class="n">LocalFileSystem</span><span class="p">()),</span>
<span class="gp"> ....: </span> <span class="n">partitions</span><span class="o">=</span><span class="p">[</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">&#39;year&#39;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2018</span><span class="p">,</span> <span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">&#39;year&#39;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2019</span><span class="p">])</span>
<span class="gp"> ....: </span>
</pre></div>
</div>
<p>Since we specified the “partition expressions” for our files, this information
is materialized as columns when reading the data and can be used for filtering:</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [41]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">()</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span>
<span class="gh">Out[41]: </span><span class="go"></span>
<span class="go"> year col1 col2</span>
<span class="go">0 2018 0 -0.093183</span>
<span class="go">1 2018 1 0.509221</span>
<span class="go">2 2018 2 0.072163</span>
<span class="go">3 2019 0 -0.093183</span>
<span class="go">4 2019 1 0.509221</span>
<span class="go">5 2019 2 0.072163</span>
<span class="gp">In [42]: </span><span class="n">dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="nb">filter</span><span class="o">=</span><span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="s1">&#39;year&#39;</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2019</span><span class="p">)</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span>
<span class="gh">Out[42]: </span><span class="go"></span>
<span class="go"> year col1 col2</span>
<span class="go">0 2019 0 -0.093183</span>
<span class="go">1 2019 1 0.509221</span>
<span class="go">2 2019 2 0.072163</span>
</pre></div>
</div>
</div>
<div class="section" id="manual-scheduling">
<h2>Manual scheduling<a class="headerlink" href="#manual-scheduling" title="Permalink to this headline"></a></h2>
<p>The <a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.to_table" title="pyarrow.dataset.Dataset.to_table"><code class="xref py py-func docutils literal notranslate"><span class="pre">to_table()</span></code></a> method loads all selected data into memory
at once resulting in a pyarrow Table. Alternatively, a dataset can also be
scanned one RecordBatch at a time in an iterative manner using the
<a class="reference internal" href="generated/pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset.scan" title="pyarrow.dataset.Dataset.scan"><code class="xref py py-func docutils literal notranslate"><span class="pre">scan()</span></code></a> method:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">for</span> <span class="n">scan_task</span> <span class="ow">in</span> <span class="n">dataset</span><span class="o">.</span><span class="n">scan</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="o">...</span><span class="p">],</span> <span class="nb">filter</span><span class="o">=...</span><span class="p">):</span>
<span class="k">for</span> <span class="n">record_batch</span> <span class="ow">in</span> <span class="n">scan_task</span><span class="o">.</span><span class="n">execute</span><span class="p">():</span>
<span class="c1"># process the record batch</span>
</pre></div>
</div>
</div>
</div>
</div>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="cuda.html" class="btn btn-neutral float-right" title="CUDA Integration" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="parquet.html" class="btn btn-neutral float-left" title="Reading and Writing the Apache Parquet Format" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
<hr/>
<div role="contentinfo">
<p>
&copy; Copyright 2016-2019 Apache Software Foundation
</p>
</div>
Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
<script type="text/javascript" src="/docs/_static/versionwarning.js"></script></body>
</html>