<!-- blob: 8c82ceccb6d2fe18bc3fcf913a31268a4a2bd3b0 [file] [log] [blame] -->
<!DOCTYPE html>
<html class="writer-html5" lang="en" >
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>pyarrow.dataset.Dataset &mdash; Apache Arrow v2.0.0</title>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
<!--[if lt IE 9]>
<script src="../../_static/js/html5shiv.min.js"></script>
<![endif]-->
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script src="../../_static/jquery.js"></script>
<script src="../../_static/underscore.js"></script>
<script src="../../_static/doctools.js"></script>
<script src="../../_static/language_data.js"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="canonical" href="https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Dataset.html" />
<link rel="index" title="Index" href="../../genindex.html" />
<link rel="search" title="Search" href="../../search.html" />
<link rel="next" title="pyarrow.dataset.FileSystemDataset" href="pyarrow.dataset.FileSystemDataset.html" />
<link rel="prev" title="pyarrow.dataset.HivePartitioning" href="pyarrow.dataset.HivePartitioning.html" />
<!-- Matomo -->
<script>
/*
 * Matomo analytics bootstrap.
 * Commands are queued on the global `_paq` array (reusing an existing one if
 * the page already defined it); the tracker script loaded below consumes them.
 */
var _paq = window._paq = window._paq || [];
/* Tracker methods such as "setCustomDimension" must be queued before "trackPageView". */
_paq.push(["setDoNotTrack", true]);
_paq.push(["disableCookies"]);
_paq.push(["trackPageView"]);
_paq.push(["enableLinkTracking"]);
(function () {
  var trackerBase = "https://analytics.apache.org/";
  _paq.push(["setTrackerUrl", trackerBase + "matomo.php"]);
  _paq.push(["setSiteId", "20"]);

  /* Inject the tracker script asynchronously, ahead of the first <script> on the page. */
  var doc = document;
  var trackerScript = doc.createElement("script");
  var firstScript = doc.getElementsByTagName("script")[0];
  trackerScript.async = true;
  trackerScript.src = trackerBase + "matomo.js";
  firstScript.parentNode.insertBefore(trackerScript, firstScript);
})();
</script>
<!-- End Matomo Code -->
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../../index.html" class="icon icon-home" title="Documentation Home"> Apache Arrow
</a>
<div class="version">
2.0.0
</div>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
<p class="caption"><span class="caption-text">Specifications and Protocols</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../format/Versioning.html">Format Versioning and Stability</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../format/Columnar.html">Arrow Columnar Format</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../format/Flight.html">Arrow Flight RPC</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../format/Integration.html">Integration Testing</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../format/CDataInterface.html">The Arrow C data interface</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../format/CStreamInterface.html">The Arrow C stream interface</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../format/Other.html">Other Data Structures</a></li>
</ul>
<p class="caption"><span class="caption-text">Libraries</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../../status.html">Implementation Status</a></li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/c_glib/">C/GLib</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../cpp/index.html">C++</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/csharp/README.md">C#</a></li>
<li class="toctree-l1"><a class="reference external" href="https://godoc.org/github.com/apache/arrow/go/arrow">Go</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../java/index.html">Java</a></li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/js/">JavaScript</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/matlab/README.md">MATLAB</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="../index.html">Python</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="../install.html">Installing PyArrow</a></li>
<li class="toctree-l2"><a class="reference internal" href="../memory.html">Memory and IO Interfaces</a></li>
<li class="toctree-l2"><a class="reference internal" href="../data.html">Data Types and In-Memory Data Model</a></li>
<li class="toctree-l2"><a class="reference internal" href="../compute.html">Compute Functions</a></li>
<li class="toctree-l2"><a class="reference internal" href="../ipc.html">Streaming, Serialization, and IPC</a></li>
<li class="toctree-l2"><a class="reference internal" href="../filesystems.html">Filesystem Interface</a></li>
<li class="toctree-l2"><a class="reference internal" href="../filesystems_deprecated.html">Filesystem Interface (legacy)</a></li>
<li class="toctree-l2"><a class="reference internal" href="../plasma.html">The Plasma In-Memory Object Store</a></li>
<li class="toctree-l2"><a class="reference internal" href="../numpy.html">NumPy Integration</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pandas.html">Pandas Integration</a></li>
<li class="toctree-l2"><a class="reference internal" href="../timestamps.html">Timestamps</a></li>
<li class="toctree-l2"><a class="reference internal" href="../csv.html">Reading CSV files</a></li>
<li class="toctree-l2"><a class="reference internal" href="../feather.html">Feather File Format</a></li>
<li class="toctree-l2"><a class="reference internal" href="../json.html">Reading JSON files</a></li>
<li class="toctree-l2"><a class="reference internal" href="../parquet.html">Reading and Writing the Apache Parquet Format</a></li>
<li class="toctree-l2"><a class="reference internal" href="../dataset.html">Tabular Datasets</a></li>
<li class="toctree-l2"><a class="reference internal" href="../cuda.html">CUDA Integration</a></li>
<li class="toctree-l2"><a class="reference internal" href="../extending_types.html">Extending pyarrow</a></li>
<li class="toctree-l2"><a class="reference internal" href="../extending.html">Using pyarrow from C++ and Cython Code</a></li>
<li class="toctree-l2 current"><a class="reference internal" href="../api.html">API Reference</a><ul class="current">
<li class="toctree-l3"><a class="reference internal" href="../api/datatypes.html">Data Types and Schemas</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/arrays.html">Arrays and Scalars</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/memory.html">Buffers and Memory</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/compute.html">Compute Functions</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/files.html">Streams and File Access</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/tables.html">Tables and Tensors</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/ipc.html">Serialization and IPC</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/flight.html">Arrow Flight</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/formats.html">Tabular File Formats</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/filesystems.html">Filesystems</a></li>
<li class="toctree-l3 current"><a class="reference internal" href="../api/dataset.html">Dataset</a><ul class="current">
<li class="toctree-l4"><a class="reference internal" href="../api/dataset.html#factory-functions">Factory functions</a></li>
<li class="toctree-l4 current"><a class="reference internal" href="../api/dataset.html#classes">Classes</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="../api/plasma.html">Plasma In-Memory Object Store</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/cuda.html">CUDA Integration</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/misc.html">Miscellaneous</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../getting_involved.html">Getting Involved</a></li>
<li class="toctree-l2"><a class="reference internal" href="../benchmarks.html">Benchmarks</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/r/">R</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/ruby/README.md">Ruby</a></li>
<li class="toctree-l1"><a class="reference external" href="https://docs.rs/crate/arrow/">Rust</a></li>
</ul>
<p class="caption"><span class="caption-text">Development</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../developers/contributing.html">Contributing to Apache Arrow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../developers/cpp/index.html">C++ Development</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../developers/python.html">Python Development</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../developers/archery.html">Daily Development using Archery</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../developers/crossbow.html">Packaging and Testing with Crossbow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../developers/docker.html">Running Docker Builds</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../developers/benchmarks.html">Benchmarks</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../developers/documentation.html">Building the Documentation</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" aria-label="top navigation">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../index.html">Apache Arrow</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<li><a href="../../index.html" class="icon icon-home" aria-label="Home"></a> &raquo;</li>
<li><a href="../index.html">Python bindings</a> &raquo;</li>
<li><a href="../api.html">API Reference</a> &raquo;</li>
<li><a href="../api/dataset.html">Dataset</a> &raquo;</li>
<li>pyarrow.dataset.Dataset</li>
<li class="wy-breadcrumbs-aside">
<a href="../../_sources/python/generated/pyarrow.dataset.Dataset.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<div class="section" id="pyarrow-dataset-dataset">
<h1>pyarrow.dataset.Dataset<a class="headerlink" href="#pyarrow-dataset-dataset" title="Permalink to this headline"></a></h1>
<dl class="py class">
<dt id="pyarrow.dataset.Dataset">
<em class="property">class </em><code class="sig-prename descclassname">pyarrow.dataset.</code><code class="sig-name descname">Dataset</code><a class="headerlink" href="#pyarrow.dataset.Dataset" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">pyarrow.lib._Weakrefable</span></code></p>
<p>Collection of data fragments and potentially child datasets.</p>
<p>Arrow Datasets allow you to query against data that has been split across
multiple files. This sharding of data may indicate partitioning, which
can accelerate queries that only touch some partitions (files).</p>
<dl class="py method">
<dt id="pyarrow.dataset.Dataset.__init__">
<code class="sig-name descname">__init__</code><span class="sig-paren">(</span><em class="sig-param"><span class="o">*</span><span class="n">args</span></em>, <em class="sig-param"><span class="o">**</span><span class="n">kwargs</span></em><span class="sig-paren">)</span><a class="headerlink" href="#pyarrow.dataset.Dataset.__init__" title="Permalink to this definition"></a></dt>
<dd><p>Initialize self. See help(type(self)) for accurate signature.</p>
</dd></dl>
<p class="rubric">Methods</p>
<table class="longtable docutils align-default">
<colgroup>
<col style="width: 10%" />
<col style="width: 90%" />
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyarrow.dataset.Dataset.__init__" title="pyarrow.dataset.Dataset.__init__"><code class="xref py py-obj docutils literal notranslate"><span class="pre">__init__</span></code></a>(*args, **kwargs)</p></td>
<td><p>Initialize self.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyarrow.dataset.Dataset.get_fragments" title="pyarrow.dataset.Dataset.get_fragments"><code class="xref py py-obj docutils literal notranslate"><span class="pre">get_fragments</span></code></a></p></td>
<td><p>Returns an iterator over the fragments in this dataset.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyarrow.dataset.Dataset.replace_schema" title="pyarrow.dataset.Dataset.replace_schema"><code class="xref py py-obj docutils literal notranslate"><span class="pre">replace_schema</span></code></a></p></td>
<td><p>Return a copy of this Dataset with a different schema.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyarrow.dataset.Dataset.scan" title="pyarrow.dataset.Dataset.scan"><code class="xref py py-obj docutils literal notranslate"><span class="pre">scan</span></code></a></p></td>
<td><p>Builds a scan operation against the dataset.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyarrow.dataset.Dataset.to_batches" title="pyarrow.dataset.Dataset.to_batches"><code class="xref py py-obj docutils literal notranslate"><span class="pre">to_batches</span></code></a></p></td>
<td><p>Read the dataset as materialized record batches.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyarrow.dataset.Dataset.to_table" title="pyarrow.dataset.Dataset.to_table"><code class="xref py py-obj docutils literal notranslate"><span class="pre">to_table</span></code></a></p></td>
<td><p>Read the dataset to an arrow table.</p></td>
</tr>
</tbody>
</table>
<p class="rubric">Attributes</p>
<table class="longtable docutils align-default">
<colgroup>
<col style="width: 10%" />
<col style="width: 90%" />
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyarrow.dataset.Dataset.partition_expression" title="pyarrow.dataset.Dataset.partition_expression"><code class="xref py py-obj docutils literal notranslate"><span class="pre">partition_expression</span></code></a></p></td>
<td><p>An Expression which evaluates to true for all data viewed by this Dataset.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyarrow.dataset.Dataset.schema" title="pyarrow.dataset.Dataset.schema"><code class="xref py py-obj docutils literal notranslate"><span class="pre">schema</span></code></a></p></td>
<td><p>The common schema of the full Dataset.</p></td>
</tr>
</tbody>
</table>
<dl class="py method">
<dt id="pyarrow.dataset.Dataset.get_fragments">
<code class="sig-name descname">get_fragments</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyarrow.dataset.Dataset.get_fragments" title="Permalink to this definition"></a></dt>
<dd><p>Returns an iterator over the fragments in this dataset.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>filter</strong> (<a class="reference internal" href="pyarrow.dataset.Expression.html#pyarrow.dataset.Expression" title="pyarrow.dataset.Expression"><em>Expression</em></a><em>, </em><em>default None</em>) – Return fragments matching the optional filter, either using the
partition_expression or internal information like Parquet’s
statistics.</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p><strong>fragments</strong> (<em>iterator of Fragment</em>)</p>
</dd>
</dl>
</dd></dl>
<dl class="py attribute">
<dt id="pyarrow.dataset.Dataset.partition_expression">
<code class="sig-name descname">partition_expression</code><a class="headerlink" href="#pyarrow.dataset.Dataset.partition_expression" title="Permalink to this definition"></a></dt>
<dd><p>An Expression which evaluates to true for all data viewed by this
Dataset.</p>
</dd></dl>
<dl class="py method">
<dt id="pyarrow.dataset.Dataset.replace_schema">
<code class="sig-name descname">replace_schema</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyarrow.dataset.Dataset.replace_schema" title="Permalink to this definition"></a></dt>
<dd><p>Return a copy of this Dataset with a different schema.</p>
<p>The copy will view the same Fragments. If the new schema is not
compatible with the original dataset’s schema then an error will
be raised.</p>
</dd></dl>
<dl class="py method">
<dt id="pyarrow.dataset.Dataset.scan">
<code class="sig-name descname">scan</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyarrow.dataset.Dataset.scan" title="Permalink to this definition"></a></dt>
<dd><p>Builds a scan operation against the dataset.</p>
<p>It produces a stream of ScanTasks, each of which is meant to be a unit
of work to be dispatched. The tasks are not executed automatically; the
user is responsible for executing and dispatching the individual tasks,
so custom local task scheduling can be implemented.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>columns</strong> (<em>list of str</em><em>, </em><em>default None</em>) – List of columns to project. Order and duplicates will be preserved.
The columns will be passed down to Datasets and corresponding data
fragments to avoid loading, copying, and deserializing columns
that will not be required further down the compute chain.
By default all of the available columns are projected. Raises
an exception if any of the referenced column names does not exist
in the dataset’s Schema.</p></li>
<li><p><strong>filter</strong> (<a class="reference internal" href="pyarrow.dataset.Expression.html#pyarrow.dataset.Expression" title="pyarrow.dataset.Expression"><em>Expression</em></a><em>, </em><em>default None</em>) – Scan will return only the rows matching the filter.
If possible the predicate will be pushed down to exploit the
partition information or internal metadata found in the data
source, e.g. Parquet statistics. Otherwise filters the loaded
RecordBatches before yielding them.</p></li>
<li><p><strong>batch_size</strong> (<em>int</em><em>, </em><em>default 1M</em>) – The maximum row count for scanned record batches. If scanned
record batches are overflowing memory then this value can be
lowered to reduce their size.</p></li>
<li><p><strong>use_threads</strong> (<em>bool</em><em>, </em><em>default True</em>) – If enabled, maximum parallelism will be used, as determined by
the number of available CPU cores.</p></li>
<li><p><strong>memory_pool</strong> (<a class="reference internal" href="pyarrow.MemoryPool.html#pyarrow.MemoryPool" title="pyarrow.MemoryPool"><em>MemoryPool</em></a><em>, </em><em>default None</em>) – For memory allocations, if required. If not specified, uses the
default pool.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p><strong>scan_tasks</strong> (<em>iterator of ScanTask</em>)</p>
</dd>
</dl>
</dd></dl>
<dl class="py attribute">
<dt id="pyarrow.dataset.Dataset.schema">
<code class="sig-name descname">schema</code><a class="headerlink" href="#pyarrow.dataset.Dataset.schema" title="Permalink to this definition"></a></dt>
<dd><p>The common schema of the full Dataset.</p>
</dd></dl>
<dl class="py method">
<dt id="pyarrow.dataset.Dataset.to_batches">
<code class="sig-name descname">to_batches</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyarrow.dataset.Dataset.to_batches" title="Permalink to this definition"></a></dt>
<dd><p>Read the dataset as materialized record batches.</p>
<p>Builds a scan operation against the dataset and sequentially executes
the ScanTasks as the returned generator gets consumed.</p>
<p>See scan method parameters documentation.</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p><strong>record_batches</strong> (<em>iterator of RecordBatch</em>)</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt id="pyarrow.dataset.Dataset.to_table">
<code class="sig-name descname">to_table</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyarrow.dataset.Dataset.to_table" title="Permalink to this definition"></a></dt>
<dd><p>Read the dataset to an arrow table.</p>
<p>Note that this method reads all the selected data from the dataset
into memory.</p>
<p>See scan method parameters documentation.</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p><strong>table</strong> (<em>Table instance</em>)</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
</div>
</div>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="pyarrow.dataset.FileSystemDataset.html" class="btn btn-neutral float-right" title="pyarrow.dataset.FileSystemDataset" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="pyarrow.dataset.HivePartitioning.html" class="btn btn-neutral float-left" title="pyarrow.dataset.HivePartitioning" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
<hr/>
<div role="contentinfo">
<p>
&copy; Copyright 2016-2019 Apache Software Foundation
</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script type="text/javascript">
// Passing a function to jQuery() registers it to run once the DOM is ready;
// the callback then switches on the Read the Docs theme's navigation
// (the `true` argument is forwarded to the theme as-is).
jQuery(function enableThemeNavigation() {
  SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>