blob: 8917b83f60ee285b19c9c75818ba70f25a87aa15 [file] [log] [blame]
<!DOCTYPE html>
<html class="writer-html5" lang="en" >
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>pyarrow.dataset.Scanner &mdash; Apache Arrow v2.0.0</title>
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
<!--[if lt IE 9]>
<script src="../../_static/js/html5shiv.min.js"></script>
<![endif]-->
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script src="../../_static/jquery.js"></script>
<script src="../../_static/underscore.js"></script>
<script src="../../_static/doctools.js"></script>
<script src="../../_static/language_data.js"></script>
<script type="text/javascript" src="../../_static/js/theme.js"></script>
<link rel="canonical" href="https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Scanner.html" />
<link rel="index" title="Index" href="../../genindex.html" />
<link rel="search" title="Search" href="../../search.html" />
<link rel="next" title="pyarrow.dataset.Expression" href="pyarrow.dataset.Expression.html" />
<link rel="prev" title="pyarrow.dataset.UnionDataset" href="pyarrow.dataset.UnionDataset.html" />
<!-- Matomo -->
<script>
// Matomo bootstrap: commands are queued on the shared _paq array, and the
// tracker script is then injected ahead of the first <script> on the page.
window._paq = window._paq || [];
var _paq = window._paq;
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["setDoNotTrack", true]);
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function (doc, baseUrl) {
// Tell the tracker where to report and which site id to use.
_paq.push(['setTrackerUrl', baseUrl + 'matomo.php']);
_paq.push(['setSiteId', '20']);
// Build the loader element; async so it does not block parsing.
var loader = doc.createElement('script');
var firstScript = doc.getElementsByTagName('script')[0];
loader.async = true;
loader.src = baseUrl + 'matomo.js';
firstScript.parentNode.insertBefore(loader, firstScript);
})(document, "https://analytics.apache.org/");
</script>
<!-- End Matomo Code -->
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../../index.html" class="icon icon-home" title="Documentation Home"> Apache Arrow
</a>
<div class="version">
2.0.0
</div>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../search.html" method="get">
<!-- A placeholder is not a label: aria-label gives the field an accessible name. -->
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
<p class="caption"><span class="caption-text">Specifications and Protocols</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../format/Versioning.html">Format Versioning and Stability</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../format/Columnar.html">Arrow Columnar Format</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../format/Flight.html">Arrow Flight RPC</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../format/Integration.html">Integration Testing</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../format/CDataInterface.html">The Arrow C data interface</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../format/CStreamInterface.html">The Arrow C stream interface</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../format/Other.html">Other Data Structures</a></li>
</ul>
<p class="caption"><span class="caption-text">Libraries</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../../status.html">Implementation Status</a></li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/c_glib/">C/GLib</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../cpp/index.html">C++</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/csharp/README.md">C#</a></li>
<li class="toctree-l1"><a class="reference external" href="https://godoc.org/github.com/apache/arrow/go/arrow">Go</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../java/index.html">Java</a></li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/js/">JavaScript</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/matlab/README.md">MATLAB</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="../index.html">Python</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="../install.html">Installing PyArrow</a></li>
<li class="toctree-l2"><a class="reference internal" href="../memory.html">Memory and IO Interfaces</a></li>
<li class="toctree-l2"><a class="reference internal" href="../data.html">Data Types and In-Memory Data Model</a></li>
<li class="toctree-l2"><a class="reference internal" href="../compute.html">Compute Functions</a></li>
<li class="toctree-l2"><a class="reference internal" href="../ipc.html">Streaming, Serialization, and IPC</a></li>
<li class="toctree-l2"><a class="reference internal" href="../filesystems.html">Filesystem Interface</a></li>
<li class="toctree-l2"><a class="reference internal" href="../filesystems_deprecated.html">Filesystem Interface (legacy)</a></li>
<li class="toctree-l2"><a class="reference internal" href="../plasma.html">The Plasma In-Memory Object Store</a></li>
<li class="toctree-l2"><a class="reference internal" href="../numpy.html">NumPy Integration</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pandas.html">Pandas Integration</a></li>
<li class="toctree-l2"><a class="reference internal" href="../timestamps.html">Timestamps</a></li>
<li class="toctree-l2"><a class="reference internal" href="../csv.html">Reading CSV files</a></li>
<li class="toctree-l2"><a class="reference internal" href="../feather.html">Feather File Format</a></li>
<li class="toctree-l2"><a class="reference internal" href="../json.html">Reading JSON files</a></li>
<li class="toctree-l2"><a class="reference internal" href="../parquet.html">Reading and Writing the Apache Parquet Format</a></li>
<li class="toctree-l2"><a class="reference internal" href="../dataset.html">Tabular Datasets</a></li>
<li class="toctree-l2"><a class="reference internal" href="../cuda.html">CUDA Integration</a></li>
<li class="toctree-l2"><a class="reference internal" href="../extending_types.html">Extending pyarrow</a></li>
<li class="toctree-l2"><a class="reference internal" href="../extending.html">Using pyarrow from C++ and Cython Code</a></li>
<li class="toctree-l2 current"><a class="reference internal" href="../api.html">API Reference</a><ul class="current">
<li class="toctree-l3"><a class="reference internal" href="../api/datatypes.html">Data Types and Schemas</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/arrays.html">Arrays and Scalars</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/memory.html">Buffers and Memory</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/compute.html">Compute Functions</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/files.html">Streams and File Access</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/tables.html">Tables and Tensors</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/ipc.html">Serialization and IPC</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/flight.html">Arrow Flight</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/formats.html">Tabular File Formats</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/filesystems.html">Filesystems</a></li>
<li class="toctree-l3 current"><a class="reference internal" href="../api/dataset.html">Dataset</a><ul class="current">
<li class="toctree-l4"><a class="reference internal" href="../api/dataset.html#factory-functions">Factory functions</a></li>
<li class="toctree-l4 current"><a class="reference internal" href="../api/dataset.html#classes">Classes</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="../api/plasma.html">Plasma In-Memory Object Store</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/cuda.html">CUDA Integration</a></li>
<li class="toctree-l3"><a class="reference internal" href="../api/misc.html">Miscellaneous</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../getting_involved.html">Getting Involved</a></li>
<li class="toctree-l2"><a class="reference internal" href="../benchmarks.html">Benchmarks</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/docs/r/">R</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/arrow/blob/master/ruby/README.md">Ruby</a></li>
<li class="toctree-l1"><a class="reference external" href="https://docs.rs/crate/arrow/">Rust</a></li>
</ul>
<p class="caption"><span class="caption-text">Development</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../developers/contributing.html">Contributing to Apache Arrow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../developers/cpp/index.html">C++ Development</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../developers/python.html">Python Development</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../developers/archery.html">Daily Development using Archery</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../developers/crossbow.html">Packaging and Testing with Crossbow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../developers/docker.html">Running Docker Builds</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../developers/benchmarks.html">Benchmarks</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../developers/documentation.html">Building the Documentation</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" aria-label="top navigation">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../index.html">Apache Arrow</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<!-- The home link has no text content (icon only), so it is named via aria-label. -->
<li><a href="../../index.html" class="icon icon-home" aria-label="Home"></a> &raquo;</li>
<li><a href="../index.html">Python bindings</a> &raquo;</li>
<li><a href="../api.html">API Reference</a> &raquo;</li>
<li><a href="../api/dataset.html">Dataset</a> &raquo;</li>
<li>pyarrow.dataset.Scanner</li>
<li class="wy-breadcrumbs-aside">
<a href="../../_sources/python/generated/pyarrow.dataset.Scanner.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<div class="section" id="pyarrow-dataset-scanner">
<h1>pyarrow.dataset.Scanner<a class="headerlink" href="#pyarrow-dataset-scanner" title="Permalink to this headline"></a></h1>
<dl class="py class">
<dt id="pyarrow.dataset.Scanner">
<em class="property">class </em><code class="sig-prename descclassname">pyarrow.dataset.</code><code class="sig-name descname">Scanner</code><a class="headerlink" href="#pyarrow.dataset.Scanner" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">pyarrow.lib._Weakrefable</span></code></p>
<p>A materialized scan operation with context and options bound.</p>
<p>A scanner is the class that glues the scan tasks, data fragments and data
sources together.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset</strong> (<a class="reference internal" href="pyarrow.dataset.Dataset.html#pyarrow.dataset.Dataset" title="pyarrow.dataset.Dataset"><em>Dataset</em></a>) – Dataset to scan.</p></li>
<li><p><strong>columns</strong> (<em>list of str</em><em>, </em><em>default None</em>) – List of columns to project. Order and duplicates will be preserved.
The columns will be passed down to Datasets and corresponding data
fragments to avoid loading, copying, and deserializing columns
that will not be required further down the compute chain.
By default all of the available columns are projected. Raises
an exception if any of the referenced column names does not exist
in the dataset’s Schema.</p></li>
<li><p><strong>filter</strong> (<a class="reference internal" href="pyarrow.dataset.Expression.html#pyarrow.dataset.Expression" title="pyarrow.dataset.Expression"><em>Expression</em></a><em>, </em><em>default None</em>) – Scan will return only the rows matching the filter.
If possible the predicate will be pushed down to exploit the
partition information or internal metadata found in the data
source, e.g. Parquet statistics. Otherwise filters the loaded
RecordBatches before yielding them.</p></li>
<li><p><strong>batch_size</strong> (<em>int</em><em>, </em><em>default 1M</em>) – The maximum row count for scanned record batches. If scanned
record batches are overflowing memory then this value can be
lowered to reduce their size.</p></li>
<li><p><strong>use_threads</strong> (<em>bool</em><em>, </em><em>default True</em>) – If enabled, then maximum parallelism will be used, as determined by
the number of available CPU cores.</p></li>
<li><p><strong>memory_pool</strong> (<a class="reference internal" href="pyarrow.MemoryPool.html#pyarrow.MemoryPool" title="pyarrow.MemoryPool"><em>MemoryPool</em></a><em>, </em><em>default None</em>) – For memory allocations, if required. If not specified, uses the
default pool.</p></li>
</ul>
</dd>
</dl>
<dl class="py method">
<dt id="pyarrow.dataset.Scanner.__init__">
<code class="sig-name descname">__init__</code><span class="sig-paren">(</span><em class="sig-param"><span class="o">*</span><span class="n">args</span></em>, <em class="sig-param"><span class="o">**</span><span class="n">kwargs</span></em><span class="sig-paren">)</span><a class="headerlink" href="#pyarrow.dataset.Scanner.__init__" title="Permalink to this definition"></a></dt>
<dd><p>Initialize self. See help(type(self)) for accurate signature.</p>
</dd></dl>
<p class="rubric">Methods</p>
<table class="longtable docutils align-default">
<colgroup>
<col style="width: 10%" />
<col style="width: 90%" />
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyarrow.dataset.Scanner.__init__" title="pyarrow.dataset.Scanner.__init__"><code class="xref py py-obj docutils literal notranslate"><span class="pre">__init__</span></code></a>(*args, **kwargs)</p></td>
<td><p>Initialize self.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyarrow.dataset.Scanner.from_dataset" title="pyarrow.dataset.Scanner.from_dataset"><code class="xref py py-obj docutils literal notranslate"><span class="pre">from_dataset</span></code></a></p></td>
<td><p></p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyarrow.dataset.Scanner.from_fragment" title="pyarrow.dataset.Scanner.from_fragment"><code class="xref py py-obj docutils literal notranslate"><span class="pre">from_fragment</span></code></a></p></td>
<td><p></p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyarrow.dataset.Scanner.get_fragments" title="pyarrow.dataset.Scanner.get_fragments"><code class="xref py py-obj docutils literal notranslate"><span class="pre">get_fragments</span></code></a></p></td>
<td><p>Returns an iterator over the fragments in this scan.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyarrow.dataset.Scanner.scan" title="pyarrow.dataset.Scanner.scan"><code class="xref py py-obj docutils literal notranslate"><span class="pre">scan</span></code></a></p></td>
<td><p>Returns a stream of ScanTasks.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="#pyarrow.dataset.Scanner.to_batches" title="pyarrow.dataset.Scanner.to_batches"><code class="xref py py-obj docutils literal notranslate"><span class="pre">to_batches</span></code></a></p></td>
<td><p>Consume a Scanner in record batches.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="#pyarrow.dataset.Scanner.to_table" title="pyarrow.dataset.Scanner.to_table"><code class="xref py py-obj docutils literal notranslate"><span class="pre">to_table</span></code></a></p></td>
<td><p>Convert a Scanner into a Table.</p></td>
</tr>
</tbody>
</table>
<dl class="py method">
<dt id="pyarrow.dataset.Scanner.from_dataset">
<em class="property">static </em><code class="sig-name descname">from_dataset</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyarrow.dataset.Scanner.from_dataset" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt id="pyarrow.dataset.Scanner.from_fragment">
<em class="property">static </em><code class="sig-name descname">from_fragment</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyarrow.dataset.Scanner.from_fragment" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt id="pyarrow.dataset.Scanner.get_fragments">
<code class="sig-name descname">get_fragments</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyarrow.dataset.Scanner.get_fragments" title="Permalink to this definition"></a></dt>
<dd><p>Returns an iterator over the fragments in this scan.</p>
</dd></dl>
<dl class="py method">
<dt id="pyarrow.dataset.Scanner.scan">
<code class="sig-name descname">scan</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyarrow.dataset.Scanner.scan" title="Permalink to this definition"></a></dt>
<dd><p>Returns a stream of ScanTasks.</p>
<p>The caller is responsible for dispatching/scheduling said tasks. Tasks should
be safe to run in a concurrent fashion and outlive the iterator.</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p><strong>scan_tasks</strong> (<em>iterator of ScanTask</em>)</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt id="pyarrow.dataset.Scanner.to_batches">
<code class="sig-name descname">to_batches</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyarrow.dataset.Scanner.to_batches" title="Permalink to this definition"></a></dt>
<dd><p>Consume a Scanner in record batches.</p>
<p>Sequentially executes the ScanTasks as the returned generator gets
consumed.</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p><strong>record_batches</strong> (<em>iterator of RecordBatch</em>)</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt id="pyarrow.dataset.Scanner.to_table">
<code class="sig-name descname">to_table</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#pyarrow.dataset.Scanner.to_table" title="Permalink to this definition"></a></dt>
<dd><p>Convert a Scanner into a Table.</p>
<p>Use this convenience utility with care. This will serially materialize
the Scan result in memory before creating the Table.</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p><strong>table</strong> (<em>Table</em>)</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
</div>
</div>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="pyarrow.dataset.Expression.html" class="btn btn-neutral float-right" title="pyarrow.dataset.Expression" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="pyarrow.dataset.UnionDataset.html" class="btn btn-neutral float-left" title="pyarrow.dataset.UnionDataset" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
<hr/>
<div role="contentinfo">
<p>
&copy; Copyright 2016-2019 Apache Software Foundation
</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script type="text/javascript">
// Passing a function to jQuery() registers it to run once the DOM is ready.
// NOTE(review): SphinxRtdTheme is presumably defined by _static/js/theme.js
// (loaded in <head>); the meaning of the `true` argument is not visible
// here — confirm against the theme source.
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
<script type="text/javascript" src="/docs/_static/versionwarning.js"></script></body>
</html>