<!-- blob: 86a36b4237b80b98e1bf03e4d12f965b1c30f068 [file] [log] [blame] -->
<!DOCTYPE html>
<html lang="en" data-content_root="" >
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>pyarrow.dataset &#8212; Apache Arrow v17.0.0.dev52</title>
<script data-cfasync="false">
  // Copy the saved colour mode/theme onto <html> before the stylesheets that
  // follow this script are reached.
  (function () {
    var root = document.documentElement;
    var savedMode = "";
    var savedTheme = "light";
    try {
      savedMode = localStorage.getItem("mode") || savedMode;
      savedTheme = localStorage.getItem("theme") || savedTheme;
    } catch (err) {
      // Reading localStorage can throw (e.g. SecurityError when storage is
      // blocked); keep the defaults so the attributes are still written.
    }
    root.dataset.mode = savedMode;
    root.dataset.theme = savedTheme;
  })();
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../../_static/styles/theme.css?digest=8d27b9dea8ad943066ae" rel="stylesheet" />
<link href="../../_static/styles/bootstrap.css?digest=8d27b9dea8ad943066ae" rel="stylesheet" />
<link href="../../_static/styles/pydata-sphinx-theme.css?digest=8d27b9dea8ad943066ae" rel="stylesheet" />
<link href="../../_static/vendor/fontawesome/6.5.1/css/all.min.css?digest=8d27b9dea8ad943066ae" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../_static/vendor/fontawesome/6.5.1/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../_static/vendor/fontawesome/6.5.1/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../_static/vendor/fontawesome/6.5.1/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../../_static/pygments.css" />
<link rel="stylesheet" type="text/css" href="../../_static/copybutton.css" />
<link rel="stylesheet" type="text/css" href="../../_static/design-style.1e8bd061cd6da7fc9cf755528e8ffc24.min.css" />
<link rel="stylesheet" type="text/css" href="../../_static/theme_overrides.css" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=8d27b9dea8ad943066ae" />
<link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=8d27b9dea8ad943066ae" />
<script src="../../_static/vendor/fontawesome/6.5.1/js/all.min.js?digest=8d27b9dea8ad943066ae"></script>
<script data-url_root="../../" id="documentation_options" src="../../_static/documentation_options.js"></script>
<script src="../../_static/doctools.js"></script>
<script src="../../_static/sphinx_highlight.js"></script>
<script src="../../_static/clipboard.min.js"></script>
<script src="../../_static/copybutton.js"></script>
<script src="../../_static/design-tabs.js"></script>
<script>DOCUMENTATION_OPTIONS.pagename = '_modules/pyarrow/dataset';</script>
<script>
  // Extend the DOCUMENTATION_OPTIONS object (defined by documentation_options.js,
  // loaded earlier in <head>) with the theme settings for this build.
  // NOTE(review): presumably read by the theme's version-switcher code - confirm.
  Object.assign(DOCUMENTATION_OPTIONS, {
    theme_version: '0.15.2',
    theme_switcher_json_url: '/docs/_static/versions.json',
    theme_switcher_version_match: 'dev/',
    show_version_warning_banner: true
  });
</script>
<!-- Canonical URL, favicon and the Sphinx index/search relations for this page. -->
<link rel="canonical" href="https://arrow.apache.org/docs/_modules/pyarrow/dataset.html">
<link rel="icon" href="../../_static/favicon.ico">
<link rel="index" title="Index" href="../../genindex.html">
<link rel="search" title="Search" href="../../search.html">
<!-- The viewport is declared once near the top of <head>; the second, equivalent declaration that used to sit here was dropped. -->
<meta name="docsearch:language" content="en">
<!-- Matomo -->
<script>
  // Command queue shared through window._paq; an existing queue is reused rather than replaced.
  var _paq = window._paq = window._paq || [];
  /* Tracker methods such as "setCustomDimension" have to be queued ahead of "trackPageView". */
  /* Cookie tracking is switched off explicitly to avoid privacy issues. */
  _paq.push(['disableCookies']);
  _paq.push(['trackPageView']);
  _paq.push(['enableLinkTracking']);
  (function () {
    var baseUrl = "https://analytics.apache.org/";
    _paq.push(['setTrackerUrl', baseUrl + 'matomo.php']);
    _paq.push(['setSiteId', '20']);
    // Insert the async tracker script ahead of the first <script> element in the document.
    var firstScript = document.getElementsByTagName('script')[0];
    var tracker = document.createElement('script');
    tracker.async = true;
    tracker.src = baseUrl + 'matomo.js';
    firstScript.parentNode.insertBefore(tracker, firstScript);
  })();
</script>
<!-- End Matomo Code -->
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<!-- Skip link: jumps past the navigation straight to the element with id "main-content". -->
<a class="skip-link" id="pst-skip-link" href="#main-content">Skip to main content</a>
<div id="pst-scroll-pixel-helper"></div>
<button class="btn rounded-pill" id="pst-back-to-top" type="button">
  <i class="fa-solid fa-arrow-up"></i>
  Back to top
</button>
<!-- One checkbox plus overlay label per sidebar; each label targets its checkbox through "for". -->
<input class="sidebar-toggle" id="__primary" name="__primary" type="checkbox">
<label class="overlay overlay-primary" for="__primary"></label>
<input class="sidebar-toggle" id="__secondary" name="__secondary" type="checkbox">
<label class="overlay overlay-secondary" for="__secondary"></label>
<!-- Search overlay: a GET form that sends the query parameter "q" to search.html. -->
<div class="search-button__wrapper">
  <div class="search-button__overlay"></div>
  <div class="search-button__search-container">
    <form class="bd-search d-flex align-items-center" action="../../search.html" method="get">
      <i class="fa-solid fa-magnifying-glass"></i>
      <input class="form-control" id="search-input" name="q" type="search"
             placeholder="Search the docs ..." aria-label="Search the docs ..."
             autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false">
      <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
    </form></div>
</div>
<header class="bd-header navbar navbar-expand-lg bd-navbar">
<div class="bd-header__inner bd-page-width">
<!-- Label bound to the "__primary" checkbox; clicking it toggles that checkbox. -->
<label class="sidebar-toggle primary-toggle" for="__primary">
  <span class="fa-solid fa-bars"></span>
</label>
<div class="col-lg-3 navbar-header-items__start">
  <div class="navbar-item">
    <!-- Brand link to the documentation root: a light-scheme logo in markup, a dark-scheme logo written by script. -->
    <a class="navbar-brand logo" href="../../index.html">
      <img class="logo__image only-light" src="../../_static/arrow.png" alt="Apache Arrow v17.0.0.dev52 - Home">
      <script>document.write(`<img src="../../_static/arrow-dark.png" class="logo__image only-dark" alt="Apache Arrow v17.0.0.dev52 - Home"/>`);</script>
    </a></div>
</div>
<div class="col-lg-9 navbar-header-items">
<div class="me-auto navbar-header-items__center">
<div class="navbar-item">
  <!-- Header navigation: two top-level links plus the "Implementations" dropdown.
       The nav carries an aria-label so it can be told apart from the other
       navigation landmarks on the page (e.g. the labelled breadcrumb nav). -->
  <nav class="navbar-nav" aria-label="Site navigation">
    <ul class="bd-navbar-elements navbar-nav">
      <li class="nav-item">
        <a class="nav-link nav-internal" href="../../format/index.html">Specifications</a>
      </li>
      <li class="nav-item">
        <a class="nav-link nav-internal" href="../../developers/index.html">Development</a>
      </li>
      <li class="nav-item dropdown">
        <!-- aria-controls points at the id of the list that follows. -->
        <button class="btn dropdown-toggle nav-item" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-controls="pst-nav-more-links">
          Implementations
        </button>
        <ul id="pst-nav-more-links" class="dropdown-menu">
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-internal" href="../../c_glib/index.html">C/GLib</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-internal" href="../../cpp/index.html">C++</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://github.com/apache/arrow/blob/main/csharp/README.md">C#</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://pkg.go.dev/github.com/apache/arrow/go/v17">Go</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-internal" href="../../java/index.html">Java</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-internal" href="../../js/index.html">JavaScript</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/julia/">Julia</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://github.com/apache/arrow/blob/main/matlab/README.md">MATLAB</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/nanoarrow/">nanoarrow</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-internal" href="../../python/index.html">Python</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-internal" href="../../r/index.html">R</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://github.com/apache/arrow/blob/main/ruby/README.md">Ruby</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://docs.rs/crate/arrow/">Rust</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-internal" href="../../status.html">Implementation Status</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/cpp/">C++ cookbook</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/java/">Java cookbook</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/py/">Python cookbook</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/r/">R cookbook</a>
          </li>
        </ul>
      </li>
    </ul>
  </nav></div>
</div>
<div class="navbar-header-items__end">
<div class="navbar-item navbar-persistent--container">
<script>
// The search button is emitted through document.write, so it only exists when scripts run.
// NOTE(review): presumably because the overlay it opens is script-driven - confirm against the theme.
// type="button" is stated explicitly, matching the other buttons on this page.
document.write(`
<button type="button" class="btn navbar-btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
`);
</script>
</div>
<div class="navbar-item">
  <script>
    // Writes the version-switcher shell: a toggle button and an empty listbox.
    // The markup's own comments say both the button text and the list are filled in by JavaScript later.
    (function () {
      var switcherMarkup = `
<div class="version-switcher__container dropdown">
<button id="pst-version-switcher-button-2"
type="button"
class="version-switcher__button btn btn-sm navbar-btn dropdown-toggle"
data-bs-toggle="dropdown"
aria-haspopup="listbox"
aria-controls="pst-version-switcher-list-2"
aria-label="Version switcher list"
>
Choose version <!-- this text may get changed later by javascript -->
<span class="caret"></span>
</button>
<div id="pst-version-switcher-list-2"
class="version-switcher__menu dropdown-menu list-group-flush py-0"
role="listbox" aria-labelledby="pst-version-switcher-button-2">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
`;
      document.write(switcherMarkup);
    })();
  </script></div>
<div class="navbar-item">
<script>
// Theme switch button with one icon span per data-mode value (light, dark, auto).
// Emitted through document.write, so it only exists when scripts run.
// type="button" is stated explicitly, matching the other buttons on this page.
document.write(`
<button type="button" class="btn btn-sm navbar-btn theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch nav-link" data-mode="light"><i class="fa-solid fa-sun fa-lg"></i></span>
<span class="theme-switch nav-link" data-mode="dark"><i class="fa-solid fa-moon fa-lg"></i></span>
<span class="theme-switch nav-link" data-mode="auto"><i class="fa-solid fa-circle-half-stroke fa-lg"></i></span>
</button>
`);
</script></div>
<!-- External project links; each opens in a new tab and carries a screen-reader-only text name. -->
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav" aria-label="Icon Links">
    <li class="nav-item">
      <a class="nav-link" href="https://github.com/apache/arrow" title="GitHub" target="_blank" rel="noopener" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-square-github fa-lg" aria-hidden="true"></i></span>
        <span class="sr-only">GitHub</span></a>
    </li>
    <li class="nav-item">
      <a class="nav-link" href="https://twitter.com/ApacheArrow" title="X" target="_blank" rel="noopener" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-square-x-twitter fa-lg" aria-hidden="true"></i></span>
        <span class="sr-only">X</span></a>
    </li>
  </ul></div>
</div>
</div>
<div class="navbar-persistent--mobile">
<script>
// Second copy of the search button, placed in the "navbar-persistent--mobile" container.
// Emitted through document.write, so it only exists when scripts run.
// type="button" is stated explicitly, matching the other buttons on this page.
document.write(`
<button type="button" class="btn navbar-btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
`);
</script>
</div>
</div>
</header>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar hide-on-wide">
<div class="sidebar-header-items sidebar-primary__section">
<div class="sidebar-header-items__center">
<div class="navbar-item">
  <!-- Sidebar copy of the site navigation: two top-level links plus the
       "Implementations" dropdown. The nav carries an aria-label so it can be
       told apart from the other navigation landmarks on the page. -->
  <nav class="navbar-nav" aria-label="Sidebar site navigation">
    <ul class="bd-navbar-elements navbar-nav">
      <li class="nav-item">
        <a class="nav-link nav-internal" href="../../format/index.html">Specifications</a>
      </li>
      <li class="nav-item">
        <a class="nav-link nav-internal" href="../../developers/index.html">Development</a>
      </li>
      <li class="nav-item dropdown">
        <!-- aria-controls points at the id of the list that follows. -->
        <button class="btn dropdown-toggle nav-item" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-controls="pst-nav-more-links-2">
          Implementations
        </button>
        <ul id="pst-nav-more-links-2" class="dropdown-menu">
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-internal" href="../../c_glib/index.html">C/GLib</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-internal" href="../../cpp/index.html">C++</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://github.com/apache/arrow/blob/main/csharp/README.md">C#</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://pkg.go.dev/github.com/apache/arrow/go/v17">Go</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-internal" href="../../java/index.html">Java</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-internal" href="../../js/index.html">JavaScript</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/julia/">Julia</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://github.com/apache/arrow/blob/main/matlab/README.md">MATLAB</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/nanoarrow/">nanoarrow</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-internal" href="../../python/index.html">Python</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-internal" href="../../r/index.html">R</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://github.com/apache/arrow/blob/main/ruby/README.md">Ruby</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://docs.rs/crate/arrow/">Rust</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-internal" href="../../status.html">Implementation Status</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/cpp/">C++ cookbook</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/java/">Java cookbook</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/py/">Python cookbook</a>
          </li>
          <li class="nav-item">
            <a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/r/">R cookbook</a>
          </li>
        </ul>
      </li>
    </ul>
  </nav></div>
</div>
<div class="sidebar-header-items__end">
<div class="navbar-item">
  <script>
    // Sidebar instance of the version-switcher shell (ids end in "-3"): a toggle button and an empty listbox.
    // The markup's own comments say both the button text and the list are filled in by JavaScript later.
    (function () {
      var switcherMarkup = `
<div class="version-switcher__container dropdown">
<button id="pst-version-switcher-button-3"
type="button"
class="version-switcher__button btn btn-sm navbar-btn dropdown-toggle"
data-bs-toggle="dropdown"
aria-haspopup="listbox"
aria-controls="pst-version-switcher-list-3"
aria-label="Version switcher list"
>
Choose version <!-- this text may get changed later by javascript -->
<span class="caret"></span>
</button>
<div id="pst-version-switcher-list-3"
class="version-switcher__menu dropdown-menu list-group-flush py-0"
role="listbox" aria-labelledby="pst-version-switcher-button-3">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
`;
      document.write(switcherMarkup);
    })();
  </script></div>
<div class="navbar-item">
<script>
// Sidebar copy of the theme switch button, with one icon span per data-mode value (light, dark, auto).
// Emitted through document.write, so it only exists when scripts run.
// type="button" is stated explicitly, matching the other buttons on this page.
document.write(`
<button type="button" class="btn btn-sm navbar-btn theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch nav-link" data-mode="light"><i class="fa-solid fa-sun fa-lg"></i></span>
<span class="theme-switch nav-link" data-mode="dark"><i class="fa-solid fa-moon fa-lg"></i></span>
<span class="theme-switch nav-link" data-mode="auto"><i class="fa-solid fa-circle-half-stroke fa-lg"></i></span>
</button>
`);
</script></div>
<!-- Sidebar copy of the external project links; each opens in a new tab and carries a screen-reader-only text name. -->
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav" aria-label="Icon Links">
    <li class="nav-item">
      <a class="nav-link" href="https://github.com/apache/arrow" title="GitHub" target="_blank" rel="noopener" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-square-github fa-lg" aria-hidden="true"></i></span>
        <span class="sr-only">GitHub</span></a>
    </li>
    <li class="nav-item">
      <a class="nav-link" href="https://twitter.com/ApacheArrow" title="X" target="_blank" rel="noopener" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-square-x-twitter fa-lg" aria-hidden="true"></i></span>
        <span class="sr-only">X</span></a>
    </li>
  </ul></div>
</div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main">
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article">
  <div class="header-article-items header-article__inner">
    <div class="header-article-items__start">
      <div class="header-article-item">
        <!-- Breadcrumb trail: Home, Module code, pyarrow, then the current page (marked with aria-current). -->
        <nav aria-label="Breadcrumb">
          <ul class="bd-breadcrumbs">
            <li class="breadcrumb-item breadcrumb-home">
              <a class="nav-link" href="../../index.html" aria-label="Home">
                <i class="fa-solid fa-home"></i>
              </a>
            </li>
            <li class="breadcrumb-item"><a class="nav-link" href="../index.html">Module code</a></li>
            <li class="breadcrumb-item"><a class="nav-link" href="../pyarrow.html">pyarrow</a></li>
            <li class="breadcrumb-item active" aria-current="page">pyarrow.dataset</li>
          </ul>
        </nav>
      </div>
    </div>
  </div>
</div>
<div id="searchbox"></div>
<article class="bd-article">
<h1>Source code for pyarrow.dataset</h1><div class="highlight"><pre>
<span></span><span class="c1"># Licensed to the Apache Software Foundation (ASF) under one</span>
<span class="c1"># or more contributor license agreements. See the NOTICE file</span>
<span class="c1"># distributed with this work for additional information</span>
<span class="c1"># regarding copyright ownership. The ASF licenses this file</span>
<span class="c1"># to you under the Apache License, Version 2.0 (the</span>
<span class="c1"># &quot;License&quot;); you may not use this file except in compliance</span>
<span class="c1"># with the License. You may obtain a copy of the License at</span>
<span class="c1">#</span>
<span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span>
<span class="c1">#</span>
<span class="c1"># Unless required by applicable law or agreed to in writing,</span>
<span class="c1"># software distributed under the License is distributed on an</span>
<span class="c1"># &quot;AS IS&quot; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY</span>
<span class="c1"># KIND, either express or implied. See the License for the</span>
<span class="c1"># specific language governing permissions and limitations</span>
<span class="c1"># under the License.</span>
<span class="sd">&quot;&quot;&quot;Dataset is currently unstable. APIs subject to change without notice.&quot;&quot;&quot;</span>
<span class="kn">import</span> <span class="nn">pyarrow</span> <span class="k">as</span> <span class="nn">pa</span>
<span class="kn">from</span> <span class="nn">pyarrow.util</span> <span class="kn">import</span> <span class="n">_is_iterable</span><span class="p">,</span> <span class="n">_stringify_path</span><span class="p">,</span> <span class="n">_is_path_like</span>
<span class="k">try</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyarrow._dataset</span> <span class="kn">import</span> <span class="p">(</span> <span class="c1"># noqa</span>
<span class="n">CsvFileFormat</span><span class="p">,</span>
<span class="n">CsvFragmentScanOptions</span><span class="p">,</span>
<span class="n">JsonFileFormat</span><span class="p">,</span>
<span class="n">JsonFragmentScanOptions</span><span class="p">,</span>
<span class="n">Dataset</span><span class="p">,</span>
<span class="n">DatasetFactory</span><span class="p">,</span>
<span class="n">DirectoryPartitioning</span><span class="p">,</span>
<span class="n">FeatherFileFormat</span><span class="p">,</span>
<span class="n">FilenamePartitioning</span><span class="p">,</span>
<span class="n">FileFormat</span><span class="p">,</span>
<span class="n">FileFragment</span><span class="p">,</span>
<span class="n">FileSystemDataset</span><span class="p">,</span>
<span class="n">FileSystemDatasetFactory</span><span class="p">,</span>
<span class="n">FileSystemFactoryOptions</span><span class="p">,</span>
<span class="n">FileWriteOptions</span><span class="p">,</span>
<span class="n">Fragment</span><span class="p">,</span>
<span class="n">FragmentScanOptions</span><span class="p">,</span>
<span class="n">HivePartitioning</span><span class="p">,</span>
<span class="n">IpcFileFormat</span><span class="p">,</span>
<span class="n">IpcFileWriteOptions</span><span class="p">,</span>
<span class="n">InMemoryDataset</span><span class="p">,</span>
<span class="n">Partitioning</span><span class="p">,</span>
<span class="n">PartitioningFactory</span><span class="p">,</span>
<span class="n">Scanner</span><span class="p">,</span>
<span class="n">TaggedRecordBatch</span><span class="p">,</span>
<span class="n">UnionDataset</span><span class="p">,</span>
<span class="n">UnionDatasetFactory</span><span class="p">,</span>
<span class="n">WrittenFile</span><span class="p">,</span>
<span class="n">get_partition_keys</span><span class="p">,</span>
<span class="n">get_partition_keys</span> <span class="k">as</span> <span class="n">_get_partition_keys</span><span class="p">,</span> <span class="c1"># keep for backwards compatibility</span>
<span class="n">_filesystemdataset_write</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">except</span> <span class="ne">ImportError</span> <span class="k">as</span> <span class="n">exc</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ImportError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;The pyarrow installation is not built with support for &#39;dataset&#39; (</span><span class="si">{</span><span class="nb">str</span><span class="p">(</span><span class="n">exc</span><span class="p">)</span><span class="si">}</span><span class="s2">)&quot;</span>
<span class="p">)</span> <span class="kn">from</span> <span class="kc">None</span>
<span class="c1"># keep Expression functionality exposed here for backwards compatibility</span>
<span class="kn">from</span> <span class="nn">pyarrow.compute</span> <span class="kn">import</span> <span class="n">Expression</span><span class="p">,</span> <span class="n">scalar</span><span class="p">,</span> <span class="n">field</span> <span class="c1"># noqa</span>
<span class="n">_orc_available</span> <span class="o">=</span> <span class="kc">False</span>
<span class="n">_orc_msg</span> <span class="o">=</span> <span class="p">(</span>
<span class="s2">&quot;The pyarrow installation is not built with support for the ORC file &quot;</span>
<span class="s2">&quot;format.&quot;</span>
<span class="p">)</span>
<span class="k">try</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyarrow._dataset_orc</span> <span class="kn">import</span> <span class="n">OrcFileFormat</span>
<span class="n">_orc_available</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">except</span> <span class="ne">ImportError</span><span class="p">:</span>
<span class="k">pass</span>
<span class="n">_parquet_available</span> <span class="o">=</span> <span class="kc">False</span>
<span class="n">_parquet_msg</span> <span class="o">=</span> <span class="p">(</span>
<span class="s2">&quot;The pyarrow installation is not built with support for the Parquet file &quot;</span>
<span class="s2">&quot;format.&quot;</span>
<span class="p">)</span>
<span class="k">try</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyarrow._dataset_parquet</span> <span class="kn">import</span> <span class="p">(</span> <span class="c1"># noqa</span>
<span class="n">ParquetDatasetFactory</span><span class="p">,</span>
<span class="n">ParquetFactoryOptions</span><span class="p">,</span>
<span class="n">ParquetFileFormat</span><span class="p">,</span>
<span class="n">ParquetFileFragment</span><span class="p">,</span>
<span class="n">ParquetFileWriteOptions</span><span class="p">,</span>
<span class="n">ParquetFragmentScanOptions</span><span class="p">,</span>
<span class="n">ParquetReadOptions</span><span class="p">,</span>
<span class="n">RowGroupInfo</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">_parquet_available</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">except</span> <span class="ne">ImportError</span><span class="p">:</span>
<span class="k">pass</span>
<span class="k">try</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyarrow._dataset_parquet_encryption</span> <span class="kn">import</span> <span class="p">(</span> <span class="c1"># noqa</span>
<span class="n">ParquetDecryptionConfig</span><span class="p">,</span>
<span class="n">ParquetEncryptionConfig</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">except</span> <span class="ne">ImportError</span><span class="p">:</span>
<span class="k">pass</span>
<span class="k">def</span> <span class="fm">__getattr__</span><span class="p">(</span><span class="n">name</span><span class="p">):</span>
<span class="k">if</span> <span class="n">name</span> <span class="o">==</span> <span class="s2">&quot;OrcFileFormat&quot;</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">_orc_available</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ImportError</span><span class="p">(</span><span class="n">_orc_msg</span><span class="p">)</span>
<span class="k">if</span> <span class="n">name</span> <span class="o">==</span> <span class="s2">&quot;ParquetFileFormat&quot;</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">_parquet_available</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ImportError</span><span class="p">(</span><span class="n">_parquet_msg</span><span class="p">)</span>
<span class="k">raise</span> <span class="ne">AttributeError</span><span class="p">(</span>
<span class="s2">&quot;module &#39;pyarrow.dataset&#39; has no attribute &#39;</span><span class="si">{0}</span><span class="s2">&#39;&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">name</span><span class="p">)</span>
<span class="p">)</span>
<div class="viewcode-block" id="partitioning"><a class="viewcode-back" href="../../python/generated/pyarrow.dataset.partitioning.html#pyarrow.dataset.partitioning">[docs]</a><span class="k">def</span> <span class="nf">partitioning</span><span class="p">(</span><span class="n">schema</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">field_names</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">flavor</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">dictionaries</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Specify a partitioning scheme.</span>
<span class="sd"> The supported schemes include:</span>
<span class="sd"> - &quot;DirectoryPartitioning&quot;: this scheme expects one segment in the file path</span>
<span class="sd"> for each field in the specified schema (all fields are required to be</span>
<span class="sd"> present). For example given schema&lt;year:int16, month:int8&gt; the path</span>
<span class="sd"> &quot;/2009/11&quot; would be parsed to (&quot;year&quot;_ == 2009 and &quot;month&quot;_ == 11).</span>
<span class="sd"> - &quot;HivePartitioning&quot;: a scheme for &quot;/$key=$value/&quot; nested directories as</span>
<span class="sd"> found in Apache Hive. This is a multi-level, directory based partitioning</span>
<span class="sd"> scheme. Data is partitioned by static values of a particular column in</span>
<span class="sd"> the schema. Partition keys are represented in the form $key=$value in</span>
<span class="sd"> directory names. Field order is ignored, as are missing or unrecognized</span>
<span class="sd"> field names.</span>
<span class="sd"> For example, given schema&lt;year:int16, month:int8, day:int8&gt;, a possible</span>
<span class="sd"> path would be &quot;/year=2009/month=11/day=15&quot; (but the field order does not</span>
<span class="sd"> need to match).</span>
<span class="sd"> - &quot;FilenamePartitioning&quot;: this scheme expects the partitions will have</span>
<span class="sd"> filenames containing the field values separated by &quot;_&quot;.</span>
<span class="sd"> For example, given schema&lt;year:int16, month:int8, day:int8&gt;, a possible</span>
<span class="sd"> partition filename &quot;2009_11_part-0.parquet&quot; would be parsed</span>
<span class="sd"> to (&quot;year&quot;_ == 2009 and &quot;month&quot;_ == 11).</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> schema : pyarrow.Schema, default None</span>
<span class="sd"> The schema that describes the partitions present in the file path.</span>
<span class="sd"> If not specified, and `field_names` and/or `flavor` are specified,</span>
<span class="sd"> the schema will be inferred from the file path (and a</span>
<span class="sd"> PartitioningFactory is returned).</span>
<span class="sd"> field_names : list of str, default None</span>
<span class="sd"> A list of strings (field names). If specified, the schema&#39;s types are</span>
<span class="sd"> inferred from the file paths (only valid for DirectoryPartitioning).</span>
<span class="sd"> flavor : str, default None</span>
<span class="sd"> The default is DirectoryPartitioning. Specify ``flavor=&quot;hive&quot;`` for</span>
<span class="sd"> a HivePartitioning, and ``flavor=&quot;filename&quot;`` for a</span>
<span class="sd"> FilenamePartitioning.</span>
<span class="sd"> dictionaries : dict[str, Array]</span>
<span class="sd"> If the type of any field of `schema` is a dictionary type, the</span>
<span class="sd"> corresponding entry of `dictionaries` must be an array containing</span>
<span class="sd"> every value which may be taken by the corresponding column or an</span>
<span class="sd"> error will be raised in parsing. Alternatively, pass `infer` to have</span>
<span class="sd"> Arrow discover the dictionary values, in which case a</span>
<span class="sd"> PartitioningFactory is returned.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Partitioning or PartitioningFactory</span>
<span class="sd"> The partitioning scheme</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Specify the Schema for paths like &quot;/2009/June&quot;:</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow as pa</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow.dataset as ds</span>
<span class="sd"> &gt;&gt;&gt; part = ds.partitioning(pa.schema([(&quot;year&quot;, pa.int16()),</span>
<span class="sd"> ... (&quot;month&quot;, pa.string())]))</span>
<span class="sd"> or let the types be inferred by only specifying the field names:</span>
<span class="sd"> &gt;&gt;&gt; part = ds.partitioning(field_names=[&quot;year&quot;, &quot;month&quot;])</span>
<span class="sd"> For paths like &quot;/2009/June&quot;, the year will be inferred as int32 while month</span>
<span class="sd"> will be inferred as string.</span>
<span class="sd"> Specify a Schema with dictionary encoding, providing dictionary values:</span>
<span class="sd"> &gt;&gt;&gt; part = ds.partitioning(</span>
<span class="sd"> ... pa.schema([</span>
<span class="sd"> ... (&quot;year&quot;, pa.int16()),</span>
<span class="sd"> ... (&quot;month&quot;, pa.dictionary(pa.int8(), pa.string()))</span>
<span class="sd"> ... ]),</span>
<span class="sd"> ... dictionaries={</span>
<span class="sd"> ... &quot;month&quot;: pa.array([&quot;January&quot;, &quot;February&quot;, &quot;March&quot;]),</span>
<span class="sd"> ... })</span>
<span class="sd"> Alternatively, specify a Schema with dictionary encoding, but have Arrow</span>
<span class="sd"> infer the dictionary values:</span>
<span class="sd"> &gt;&gt;&gt; part = ds.partitioning(</span>
<span class="sd"> ... pa.schema([</span>
<span class="sd"> ... (&quot;year&quot;, pa.int16()),</span>
<span class="sd"> ... (&quot;month&quot;, pa.dictionary(pa.int8(), pa.string()))</span>
<span class="sd"> ... ]),</span>
<span class="sd"> ... dictionaries=&quot;infer&quot;)</span>
<span class="sd"> Create a Hive scheme for a path like &quot;/year=2009/month=11&quot;:</span>
<span class="sd"> &gt;&gt;&gt; part = ds.partitioning(</span>
<span class="sd"> ... pa.schema([(&quot;year&quot;, pa.int16()), (&quot;month&quot;, pa.int8())]),</span>
<span class="sd"> ... flavor=&quot;hive&quot;)</span>
<span class="sd"> A Hive scheme can also be discovered from the directory structure (and</span>
<span class="sd"> types will be inferred):</span>
<span class="sd"> &gt;&gt;&gt; part = ds.partitioning(flavor=&quot;hive&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">flavor</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="c1"># default flavor</span>
<span class="k">if</span> <span class="n">schema</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="n">field_names</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Cannot specify both &#39;schema&#39; and &#39;field_names&#39;&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">dictionaries</span> <span class="o">==</span> <span class="s1">&#39;infer&#39;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">DirectoryPartitioning</span><span class="o">.</span><span class="n">discover</span><span class="p">(</span><span class="n">schema</span><span class="o">=</span><span class="n">schema</span><span class="p">)</span>
<span class="k">return</span> <span class="n">DirectoryPartitioning</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">dictionaries</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">field_names</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">field_names</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span>
<span class="k">return</span> <span class="n">DirectoryPartitioning</span><span class="o">.</span><span class="n">discover</span><span class="p">(</span><span class="n">field_names</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Expected list of field names, got </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="nb">type</span><span class="p">(</span><span class="n">field_names</span><span class="p">)))</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;For the default directory flavor, need to specify &quot;</span>
<span class="s2">&quot;a Schema or a list of field names&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">flavor</span> <span class="o">==</span> <span class="s2">&quot;filename&quot;</span><span class="p">:</span>
<span class="k">if</span> <span class="n">schema</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="n">field_names</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Cannot specify both &#39;schema&#39; and &#39;field_names&#39;&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">dictionaries</span> <span class="o">==</span> <span class="s1">&#39;infer&#39;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">FilenamePartitioning</span><span class="o">.</span><span class="n">discover</span><span class="p">(</span><span class="n">schema</span><span class="o">=</span><span class="n">schema</span><span class="p">)</span>
<span class="k">return</span> <span class="n">FilenamePartitioning</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">dictionaries</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">field_names</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">field_names</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span>
<span class="k">return</span> <span class="n">FilenamePartitioning</span><span class="o">.</span><span class="n">discover</span><span class="p">(</span><span class="n">field_names</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Expected list of field names, got </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="nb">type</span><span class="p">(</span><span class="n">field_names</span><span class="p">)))</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;For the filename flavor, need to specify &quot;</span>
<span class="s2">&quot;a Schema or a list of field names&quot;</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">flavor</span> <span class="o">==</span> <span class="s1">&#39;hive&#39;</span><span class="p">:</span>
<span class="k">if</span> <span class="n">field_names</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Cannot specify &#39;field_names&#39; for flavor &#39;hive&#39;&quot;</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">schema</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">Schema</span><span class="p">):</span>
<span class="k">if</span> <span class="n">dictionaries</span> <span class="o">==</span> <span class="s1">&#39;infer&#39;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">HivePartitioning</span><span class="o">.</span><span class="n">discover</span><span class="p">(</span><span class="n">schema</span><span class="o">=</span><span class="n">schema</span><span class="p">)</span>
<span class="k">return</span> <span class="n">HivePartitioning</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">dictionaries</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Expected Schema for &#39;schema&#39;, got </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="nb">type</span><span class="p">(</span><span class="n">schema</span><span class="p">)))</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">HivePartitioning</span><span class="o">.</span><span class="n">discover</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Unsupported flavor&quot;</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_ensure_partitioning</span><span class="p">(</span><span class="n">scheme</span><span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    Normalize ``scheme`` into a Partitioning or PartitioningFactory.</span>

<span class="sd">    None is returned as-is, meaning no partitioning scheme is defined.</span>
<span class="sd">    A str is forwarded to ``partitioning()`` as the flavor and a list as</span>
<span class="sd">    the field names. Partitioning and PartitioningFactory objects are</span>
<span class="sd">    returned unchanged; any other type raises ValueError.</span>
<span class="sd">    &quot;&quot;&quot;</span>
    <span class="c1"># no scheme given: nothing to validate</span>
    <span class="k">if</span> <span class="n">scheme</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
        <span class="k">return</span> <span class="n">scheme</span>
    <span class="c1"># a bare string names the partitioning flavor</span>
    <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">scheme</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
        <span class="k">return</span> <span class="n">partitioning</span><span class="p">(</span><span class="n">flavor</span><span class="o">=</span><span class="n">scheme</span><span class="p">)</span>
    <span class="c1"># a list carries the field names</span>
    <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">scheme</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span>
        <span class="k">return</span> <span class="n">partitioning</span><span class="p">(</span><span class="n">field_names</span><span class="o">=</span><span class="n">scheme</span><span class="p">)</span>
    <span class="c1"># already a partitioning object: hand it back untouched</span>
    <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">scheme</span><span class="p">,</span> <span class="p">(</span><span class="n">Partitioning</span><span class="p">,</span> <span class="n">PartitioningFactory</span><span class="p">)):</span>
        <span class="k">return</span> <span class="n">scheme</span>
    <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Expected Partitioning or PartitioningFactory, got </span><span class="si">{}</span><span class="s2">&quot;</span>
                     <span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">scheme</span><span class="p">)))</span>
<span class="k">def</span> <span class="nf">_ensure_format</span><span class="p">(</span><span class="n">obj</span><span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    Resolve ``obj`` to a FileFormat instance.</span>

<span class="sd">    A FileFormat is returned unchanged. Otherwise ``obj`` is compared with</span>
<span class="sd">    the supported format names (&quot;parquet&quot;, &quot;ipc&quot;, &quot;arrow&quot;, &quot;feather&quot;,</span>
<span class="sd">    &quot;csv&quot;, &quot;orc&quot;, &quot;json&quot;) and a default-constructed format object of the</span>
<span class="sd">    matching class is returned.</span>

<span class="sd">    Raises</span>
<span class="sd">    ------</span>
<span class="sd">    ValueError</span>
<span class="sd">        If ``obj`` matches none of the supported names, or if &quot;parquet&quot; or</span>
<span class="sd">        &quot;orc&quot; is requested while the corresponding availability flag is</span>
<span class="sd">        false.</span>
<span class="sd">    &quot;&quot;&quot;</span>
    <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">FileFormat</span><span class="p">):</span>
        <span class="k">return</span> <span class="n">obj</span>
    <span class="k">if</span> <span class="n">obj</span> <span class="o">==</span> <span class="s2">&quot;parquet&quot;</span><span class="p">:</span>
        <span class="c1"># parquet support is optional; report the prepared message if absent</span>
        <span class="k">if</span> <span class="n">_parquet_available</span><span class="p">:</span>
            <span class="k">return</span> <span class="n">ParquetFileFormat</span><span class="p">()</span>
        <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">_parquet_msg</span><span class="p">)</span>
    <span class="c1"># &quot;ipc&quot; and &quot;arrow&quot; are two names for the same format</span>
    <span class="k">if</span> <span class="n">obj</span> <span class="ow">in</span> <span class="p">{</span><span class="s2">&quot;ipc&quot;</span><span class="p">,</span> <span class="s2">&quot;arrow&quot;</span><span class="p">}:</span>
        <span class="k">return</span> <span class="n">IpcFileFormat</span><span class="p">()</span>
    <span class="k">if</span> <span class="n">obj</span> <span class="o">==</span> <span class="s2">&quot;feather&quot;</span><span class="p">:</span>
        <span class="k">return</span> <span class="n">FeatherFileFormat</span><span class="p">()</span>
    <span class="k">if</span> <span class="n">obj</span> <span class="o">==</span> <span class="s2">&quot;csv&quot;</span><span class="p">:</span>
        <span class="k">return</span> <span class="n">CsvFileFormat</span><span class="p">()</span>
    <span class="k">if</span> <span class="n">obj</span> <span class="o">==</span> <span class="s2">&quot;orc&quot;</span><span class="p">:</span>
        <span class="c1"># orc support is optional as well</span>
        <span class="k">if</span> <span class="n">_orc_available</span><span class="p">:</span>
            <span class="k">return</span> <span class="n">OrcFileFormat</span><span class="p">()</span>
        <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">_orc_msg</span><span class="p">)</span>
    <span class="k">if</span> <span class="n">obj</span> <span class="o">==</span> <span class="s2">&quot;json&quot;</span><span class="p">:</span>
        <span class="k">return</span> <span class="n">JsonFileFormat</span><span class="p">()</span>
    <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;format &#39;</span><span class="si">{}</span><span class="s2">&#39; is not supported&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">obj</span><span class="p">))</span>
<span class="k">def</span> <span class="nf">_ensure_multiple_sources</span><span class="p">(</span><span class="n">paths</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    Treat a list of paths as files belonging to a single file system.</span>

<span class="sd">    If the file system is local, every path is additionally validated to</span>
<span class="sd">    reference an existing *file*. On any other file system (for example a</span>
<span class="sd">    remote one) no validation happens here and non-file paths will be</span>
<span class="sd">    silently skipped.</span>

<span class="sd">    Parameters</span>
<span class="sd">    ----------</span>
<span class="sd">    paths : list of path-like</span>
<span class="sd">        Note that URIs are not allowed.</span>
<span class="sd">    filesystem : FileSystem or str, optional</span>
<span class="sd">        If a URI is passed, then its path component will act as a prefix for</span>
<span class="sd">        the file paths.</span>

<span class="sd">    Returns</span>
<span class="sd">    -------</span>
<span class="sd">    (FileSystem, list of str)</span>
<span class="sd">        File system object and a list of normalized paths.</span>

<span class="sd">    Raises</span>
<span class="sd">    ------</span>
<span class="sd">    TypeError</span>
<span class="sd">        If the passed filesystem has wrong type.</span>
<span class="sd">    IOError</span>
<span class="sd">        If the file system is local and a referenced path is not available or</span>
<span class="sd">        not a file.</span>
<span class="sd">    &quot;&quot;&quot;</span>
    <span class="kn">from</span> <span class="nn">pyarrow.fs</span> <span class="kn">import</span> <span class="p">(</span>
        <span class="n">LocalFileSystem</span><span class="p">,</span> <span class="n">SubTreeFileSystem</span><span class="p">,</span> <span class="n">_MockFileSystem</span><span class="p">,</span> <span class="n">FileType</span><span class="p">,</span>
        <span class="n">_ensure_filesystem</span>
    <span class="p">)</span>

    <span class="c1"># default to the local file system; otherwise construct a filesystem</span>
    <span class="c1"># from the argument if it is a valid URI</span>
    <span class="n">filesystem</span> <span class="o">=</span> <span class="p">(</span>
        <span class="n">LocalFileSystem</span><span class="p">()</span> <span class="k">if</span> <span class="n">filesystem</span> <span class="ow">is</span> <span class="kc">None</span>
        <span class="k">else</span> <span class="n">_ensure_filesystem</span><span class="p">(</span><span class="n">filesystem</span><span class="p">)</span>
    <span class="p">)</span>

    <span class="c1"># local means a local or mock file system, or a subtree of a local one</span>
    <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">filesystem</span><span class="p">,</span> <span class="p">(</span><span class="n">LocalFileSystem</span><span class="p">,</span> <span class="n">_MockFileSystem</span><span class="p">)):</span>
        <span class="n">is_local</span> <span class="o">=</span> <span class="kc">True</span>
    <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">filesystem</span><span class="p">,</span> <span class="n">SubTreeFileSystem</span><span class="p">):</span>
        <span class="n">is_local</span> <span class="o">=</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">filesystem</span><span class="o">.</span><span class="n">base_fs</span><span class="p">,</span> <span class="n">LocalFileSystem</span><span class="p">)</span>
    <span class="k">else</span><span class="p">:</span>
        <span class="n">is_local</span> <span class="o">=</span> <span class="kc">False</span>

    <span class="c1"># normalization handles irregular paths such as Windows local paths</span>
    <span class="n">paths</span> <span class="o">=</span> <span class="p">[</span>
        <span class="n">filesystem</span><span class="o">.</span><span class="n">normalize_path</span><span class="p">(</span><span class="n">_stringify_path</span><span class="p">(</span><span class="n">entry</span><span class="p">))</span> <span class="k">for</span> <span class="n">entry</span> <span class="ow">in</span> <span class="n">paths</span>
    <span class="p">]</span>

    <span class="c1"># paths on non-local file systems are not validated here</span>
    <span class="k">if</span> <span class="ow">not</span> <span class="n">is_local</span><span class="p">:</span>
        <span class="k">return</span> <span class="n">filesystem</span><span class="p">,</span> <span class="n">paths</span>

    <span class="c1"># every path must point to an existing *file*; the first offending path</span>
    <span class="c1"># raises. A possible improvement is to group the file infos by type and</span>
    <span class="c1"># raise once per error category.</span>
    <span class="k">for</span> <span class="n">info</span> <span class="ow">in</span> <span class="n">filesystem</span><span class="o">.</span><span class="n">get_file_info</span><span class="p">(</span><span class="n">paths</span><span class="p">):</span>
        <span class="n">kind</span> <span class="o">=</span> <span class="n">info</span><span class="o">.</span><span class="n">type</span>
        <span class="k">if</span> <span class="n">kind</span> <span class="o">==</span> <span class="n">FileType</span><span class="o">.</span><span class="n">File</span><span class="p">:</span>
            <span class="k">continue</span>
        <span class="k">if</span> <span class="n">kind</span> <span class="o">==</span> <span class="n">FileType</span><span class="o">.</span><span class="n">NotFound</span><span class="p">:</span>
            <span class="k">raise</span> <span class="ne">FileNotFoundError</span><span class="p">(</span><span class="n">info</span><span class="o">.</span><span class="n">path</span><span class="p">)</span>
        <span class="k">if</span> <span class="n">kind</span> <span class="o">==</span> <span class="n">FileType</span><span class="o">.</span><span class="n">Directory</span><span class="p">:</span>
            <span class="k">raise</span> <span class="ne">IsADirectoryError</span><span class="p">(</span>
                <span class="s1">&#39;Path </span><span class="si">{}</span><span class="s1"> points to a directory, but only file paths are &#39;</span>
                <span class="s1">&#39;supported. To construct a nested or union dataset pass &#39;</span>
                <span class="s1">&#39;a list of dataset objects instead.&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">info</span><span class="o">.</span><span class="n">path</span><span class="p">)</span>
            <span class="p">)</span>
        <span class="c1"># anything else exists but is neither a file nor a directory</span>
        <span class="k">raise</span> <span class="ne">IOError</span><span class="p">(</span>
            <span class="s1">&#39;Path </span><span class="si">{}</span><span class="s1"> exists but its type is unknown (could be a &#39;</span>
            <span class="s1">&#39;special file such as a Unix socket or character device, &#39;</span>
            <span class="s1">&#39;or Windows NUL / CON / ...)&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">info</span><span class="o">.</span><span class="n">path</span><span class="p">)</span>
        <span class="p">)</span>

    <span class="k">return</span> <span class="n">filesystem</span><span class="p">,</span> <span class="n">paths</span>
<span class="k">def</span> <span class="nf">_ensure_single_source</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    Treat path as either a recursively traversable directory or a single file.</span>

<span class="sd">    Parameters</span>
<span class="sd">    ----------</span>
<span class="sd">    path : path-like</span>
<span class="sd">    filesystem : FileSystem or str, optional</span>
<span class="sd">        If a URI is passed, then its path component will act as a prefix for</span>
<span class="sd">        the file paths.</span>

<span class="sd">    Returns</span>
<span class="sd">    -------</span>
<span class="sd">    (FileSystem, list of str or fs.Selector)</span>
<span class="sd">        File system object and either a single item list pointing to a file or</span>
<span class="sd">        an fs.Selector object pointing to a directory.</span>

<span class="sd">    Raises</span>
<span class="sd">    ------</span>
<span class="sd">    TypeError</span>
<span class="sd">        If the passed filesystem has wrong type.</span>
<span class="sd">    FileNotFoundError</span>
<span class="sd">        If the path refers to neither a file nor a directory (for example</span>
<span class="sd">        because it doesn&#39;t exist).</span>
<span class="sd">    &quot;&quot;&quot;</span>
    <span class="kn">from</span> <span class="nn">pyarrow.fs</span> <span class="kn">import</span> <span class="n">FileType</span><span class="p">,</span> <span class="n">FileSelector</span><span class="p">,</span> <span class="n">_resolve_filesystem_and_path</span>

    <span class="c1"># `path` has already been checked to be path-like at this point</span>
    <span class="n">filesystem</span><span class="p">,</span> <span class="n">path</span> <span class="o">=</span> <span class="n">_resolve_filesystem_and_path</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="n">filesystem</span><span class="p">)</span>

    <span class="c1"># dataset discovery expects a normalized path</span>
    <span class="n">path</span> <span class="o">=</span> <span class="n">filesystem</span><span class="o">.</span><span class="n">normalize_path</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>

    <span class="c1"># look up what kind of entry the path refers to</span>
    <span class="n">path_type</span> <span class="o">=</span> <span class="n">filesystem</span><span class="o">.</span><span class="n">get_file_info</span><span class="p">(</span><span class="n">path</span><span class="p">)</span><span class="o">.</span><span class="n">type</span>

    <span class="c1"># a single file is returned wrapped in a one-element list</span>
    <span class="k">if</span> <span class="n">path_type</span> <span class="o">==</span> <span class="n">FileType</span><span class="o">.</span><span class="n">File</span><span class="p">:</span>
        <span class="k">return</span> <span class="n">filesystem</span><span class="p">,</span> <span class="p">[</span><span class="n">path</span><span class="p">]</span>
    <span class="c1"># a directory is returned as a recursive selector</span>
    <span class="k">if</span> <span class="n">path_type</span> <span class="o">==</span> <span class="n">FileType</span><span class="o">.</span><span class="n">Directory</span><span class="p">:</span>
        <span class="k">return</span> <span class="n">filesystem</span><span class="p">,</span> <span class="n">FileSelector</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="n">recursive</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
    <span class="k">raise</span> <span class="ne">FileNotFoundError</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_filesystem_dataset</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
                        <span class="n">partitioning</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
                        <span class="n">partition_base_dir</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">exclude_invalid_files</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
                        <span class="n">selector_ignore_prefixes</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    Create a FileSystemDataset which can be used to build a Dataset.</span>

<span class="sd">    Parameters are documented in the dataset function.</span>

<span class="sd">    Returns</span>
<span class="sd">    -------</span>
<span class="sd">    FileSystemDataset</span>
<span class="sd">    &quot;&quot;&quot;</span>
    <span class="kn">from</span> <span class="nn">pyarrow.fs</span> <span class="kn">import</span> <span class="n">LocalFileSystem</span><span class="p">,</span> <span class="n">_ensure_filesystem</span><span class="p">,</span> <span class="n">FileInfo</span>

    <span class="c1"># parquet is used when no (truthy) format was given</span>
    <span class="nb">format</span> <span class="o">=</span> <span class="n">_ensure_format</span><span class="p">(</span><span class="nb">format</span> <span class="ow">or</span> <span class="s1">&#39;parquet&#39;</span><span class="p">)</span>
    <span class="n">partitioning</span> <span class="o">=</span> <span class="n">_ensure_partitioning</span><span class="p">(</span><span class="n">partitioning</span><span class="p">)</span>

    <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span>
        <span class="c1"># a single path: one file or a directory to traverse</span>
        <span class="n">resolved_fs</span><span class="p">,</span> <span class="n">paths_or_selector</span> <span class="o">=</span> <span class="n">_ensure_single_source</span><span class="p">(</span>
            <span class="n">source</span><span class="p">,</span> <span class="n">filesystem</span><span class="p">)</span>
    <span class="k">elif</span> <span class="n">source</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">source</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">FileInfo</span><span class="p">):</span>
        <span class="c1"># a sequence starting with a FileInfo is used as given; only the</span>
        <span class="c1"># file system is resolved (local by default, otherwise built from</span>
        <span class="c1"># the argument if it is a valid URI)</span>
        <span class="n">resolved_fs</span> <span class="o">=</span> <span class="p">(</span>
            <span class="n">LocalFileSystem</span><span class="p">()</span> <span class="k">if</span> <span class="n">filesystem</span> <span class="ow">is</span> <span class="kc">None</span>
            <span class="k">else</span> <span class="n">_ensure_filesystem</span><span class="p">(</span><span class="n">filesystem</span><span class="p">)</span>
        <span class="p">)</span>
        <span class="n">paths_or_selector</span> <span class="o">=</span> <span class="n">source</span>
    <span class="k">else</span><span class="p">:</span>
        <span class="c1"># any other sequence is treated as a list of file paths</span>
        <span class="n">resolved_fs</span><span class="p">,</span> <span class="n">paths_or_selector</span> <span class="o">=</span> <span class="n">_ensure_multiple_sources</span><span class="p">(</span>
            <span class="n">source</span><span class="p">,</span> <span class="n">filesystem</span><span class="p">)</span>

    <span class="n">factory_options</span> <span class="o">=</span> <span class="n">FileSystemFactoryOptions</span><span class="p">(</span>
        <span class="n">partitioning</span><span class="o">=</span><span class="n">partitioning</span><span class="p">,</span>
        <span class="n">partition_base_dir</span><span class="o">=</span><span class="n">partition_base_dir</span><span class="p">,</span>
        <span class="n">exclude_invalid_files</span><span class="o">=</span><span class="n">exclude_invalid_files</span><span class="p">,</span>
        <span class="n">selector_ignore_prefixes</span><span class="o">=</span><span class="n">selector_ignore_prefixes</span>
    <span class="p">)</span>
    <span class="n">dataset_factory</span> <span class="o">=</span> <span class="n">FileSystemDatasetFactory</span><span class="p">(</span>
        <span class="n">resolved_fs</span><span class="p">,</span> <span class="n">paths_or_selector</span><span class="p">,</span> <span class="nb">format</span><span class="p">,</span> <span class="n">factory_options</span><span class="p">)</span>

    <span class="k">return</span> <span class="n">dataset_factory</span><span class="o">.</span><span class="n">finish</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_in_memory_dataset</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    Wrap ``source`` in an InMemoryDataset.</span>

<span class="sd">    Every keyword argument other than ``schema`` must be None; a ValueError</span>
<span class="sd">    is raised as soon as one of them carries a value.</span>
<span class="sd">    &quot;&quot;&quot;</span>
    <span class="k">for</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">values</span><span class="p">():</span>
        <span class="k">if</span> <span class="n">value</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
            <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
                <span class="s2">&quot;For in-memory datasets, you cannot pass any additional arguments&quot;</span><span class="p">)</span>
    <span class="k">return</span> <span class="n">InMemoryDataset</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="n">schema</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_union_dataset</span><span class="p">(</span><span class="n">children</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="n">v</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">values</span><span class="p">()):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;When passing a list of Datasets, you cannot pass any additional &quot;</span>
<span class="s2">&quot;arguments&quot;</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">schema</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="c1"># unify the children datasets&#39; schemas</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">unify_schemas</span><span class="p">([</span><span class="n">child</span><span class="o">.</span><span class="n">schema</span> <span class="k">for</span> <span class="n">child</span> <span class="ow">in</span> <span class="n">children</span><span class="p">])</span>
<span class="k">for</span> <span class="n">child</span> <span class="ow">in</span> <span class="n">children</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">child</span><span class="p">,</span> <span class="s2">&quot;_scan_options&quot;</span><span class="p">,</span> <span class="kc">None</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Creating an UnionDataset from filtered or projected Datasets &quot;</span>
<span class="s2">&quot;is currently not supported. Union the unfiltered datasets &quot;</span>
<span class="s2">&quot;and apply the filter to the resulting union.&quot;</span>
<span class="p">)</span>
<span class="c1"># create datasets with the requested schema</span>
<span class="n">children</span> <span class="o">=</span> <span class="p">[</span><span class="n">child</span><span class="o">.</span><span class="n">replace_schema</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span> <span class="k">for</span> <span class="n">child</span> <span class="ow">in</span> <span class="n">children</span><span class="p">]</span>
<span class="k">return</span> <span class="n">UnionDataset</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">children</span><span class="p">)</span>
<div class="viewcode-block" id="parquet_dataset"><a class="viewcode-back" href="../../python/generated/pyarrow.dataset.parquet_dataset.html#pyarrow.dataset.parquet_dataset">[docs]</a><span class="k">def</span> <span class="nf">parquet_dataset</span><span class="p">(</span><span class="n">metadata_path</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">partitioning</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">partition_base_dir</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Create a FileSystemDataset from a `_metadata` file created via</span>
<span class="sd"> `pyarrow.parquet.write_metadata`.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> metadata_path : path</span>
<span class="sd"> Path pointing to a single Parquet metadata file</span>
<span class="sd"> schema : Schema, optional</span>
<span class="sd"> Optionally provide the Schema for the Dataset, in which case it will</span>
<span class="sd"> not be inferred from the source.</span>
<span class="sd"> filesystem : FileSystem or URI string, default None</span>
<span class="sd"> If a single path is given as source and filesystem is None, then the</span>
<span class="sd"> filesystem will be inferred from the path.</span>
<span class="sd"> If a URI string is passed, then a filesystem object is constructed</span>
<span class="sd"> using the URI&#39;s optional path component as a directory prefix. See the</span>
<span class="sd"> examples below.</span>
<span class="sd"> Note that the URIs on Windows must follow &#39;file:///C:...&#39; or</span>
<span class="sd"> &#39;file:/C:...&#39; patterns.</span>
<span class="sd"> format : ParquetFileFormat</span>
<span class="sd"> An instance of a ParquetFileFormat if special options need to be</span>
<span class="sd"> passed.</span>
<span class="sd"> partitioning : Partitioning, PartitioningFactory, str, list of str</span>
<span class="sd"> The partitioning scheme specified with the ``partitioning()``</span>
<span class="sd"> function. A flavor string can be used as shortcut, and with a list of</span>
<span class="sd"> field names a DirectoryPartitioning will be inferred.</span>
<span class="sd"> partition_base_dir : str, optional</span>
<span class="sd"> For the purposes of applying the partitioning, paths will be</span>
<span class="sd"> stripped of the partition_base_dir. Files not matching the</span>
<span class="sd"> partition_base_dir prefix will be skipped for partitioning discovery.</span>
<span class="sd"> The ignored files will still be part of the Dataset, but will not</span>
<span class="sd"> have partition information.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> FileSystemDataset</span>
<span class="sd"> The dataset corresponding to the given metadata</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyarrow.fs</span> <span class="kn">import</span> <span class="n">LocalFileSystem</span><span class="p">,</span> <span class="n">_ensure_filesystem</span>
<span class="k">if</span> <span class="nb">format</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="nb">format</span> <span class="o">=</span> <span class="n">ParquetFileFormat</span><span class="p">()</span>
<span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="nb">format</span><span class="p">,</span> <span class="n">ParquetFileFormat</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;format argument must be a ParquetFileFormat&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">filesystem</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">filesystem</span> <span class="o">=</span> <span class="n">LocalFileSystem</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">filesystem</span> <span class="o">=</span> <span class="n">_ensure_filesystem</span><span class="p">(</span><span class="n">filesystem</span><span class="p">)</span>
<span class="n">metadata_path</span> <span class="o">=</span> <span class="n">filesystem</span><span class="o">.</span><span class="n">normalize_path</span><span class="p">(</span><span class="n">_stringify_path</span><span class="p">(</span><span class="n">metadata_path</span><span class="p">))</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">ParquetFactoryOptions</span><span class="p">(</span>
<span class="n">partition_base_dir</span><span class="o">=</span><span class="n">partition_base_dir</span><span class="p">,</span>
<span class="n">partitioning</span><span class="o">=</span><span class="n">_ensure_partitioning</span><span class="p">(</span><span class="n">partitioning</span><span class="p">)</span>
<span class="p">)</span>
<span class="n">factory</span> <span class="o">=</span> <span class="n">ParquetDatasetFactory</span><span class="p">(</span>
<span class="n">metadata_path</span><span class="p">,</span> <span class="n">filesystem</span><span class="p">,</span> <span class="nb">format</span><span class="p">,</span> <span class="n">options</span><span class="o">=</span><span class="n">options</span><span class="p">)</span>
<span class="k">return</span> <span class="n">factory</span><span class="o">.</span><span class="n">finish</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span></div>
<div class="viewcode-block" id="dataset"><a class="viewcode-back" href="../../python/generated/pyarrow.dataset.dataset.html#pyarrow.dataset.dataset">[docs]</a><span class="k">def</span> <span class="nf">dataset</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">partitioning</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">partition_base_dir</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">exclude_invalid_files</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">ignore_prefixes</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Open a dataset.</span>
<span class="sd"> Datasets provide functionality to efficiently work with tabular,</span>
<span class="sd"> potentially larger than memory and multi-file datasets.</span>
<span class="sd"> - A unified interface for different sources, like Parquet and Feather</span>
<span class="sd"> - Discovery of sources (crawling directories, handle directory-based</span>
<span class="sd"> partitioned datasets, basic schema normalization)</span>
<span class="sd"> - Optimized reading with predicate pushdown (filtering rows), projection</span>
<span class="sd"> (selecting columns), parallel reading or fine-grained managing of tasks.</span>
<span class="sd"> Note that this is the high-level API, to have more control over the dataset</span>
<span class="sd"> construction use the low-level API classes (FileSystemDataset,</span>
<span class="sd"> FileSystemDatasetFactory, etc.)</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> source : path, list of paths, dataset, list of datasets, (list of) \</span>
<span class="sd">RecordBatch or Table, iterable of RecordBatch, RecordBatchReader, or URI</span>
<span class="sd"> Path pointing to a single file:</span>
<span class="sd"> Open a FileSystemDataset from a single file.</span>
<span class="sd"> Path pointing to a directory:</span>
<span class="sd"> The directory gets discovered recursively according to a</span>
<span class="sd"> partitioning scheme if given.</span>
<span class="sd"> List of file paths:</span>
<span class="sd"> Create a FileSystemDataset from explicitly given files. The files</span>
<span class="sd"> must be located on the same filesystem given by the filesystem</span>
<span class="sd"> parameter.</span>
<span class="sd"> Note that, contrary to construction from a single file, passing</span>
<span class="sd"> URIs as paths is not allowed.</span>
<span class="sd"> List of datasets:</span>
<span class="sd"> A nested UnionDataset gets constructed, it allows arbitrary</span>
<span class="sd"> composition of other datasets.</span>
<span class="sd"> Note that additional keyword arguments are not allowed.</span>
<span class="sd"> (List of) batches or tables, iterable of batches, or RecordBatchReader:</span>
<span class="sd"> Create an InMemoryDataset. If an iterable or empty list is given,</span>
<span class="sd"> a schema must also be given. If an iterable or RecordBatchReader</span>
<span class="sd"> is given, the resulting dataset can only be scanned once; further</span>
<span class="sd"> attempts will raise an error.</span>
<span class="sd"> schema : Schema, optional</span>
<span class="sd"> Optionally provide the Schema for the Dataset, in which case it will</span>
<span class="sd"> not be inferred from the source.</span>
<span class="sd"> format : FileFormat or str</span>
<span class="sd"> Currently &quot;parquet&quot;, &quot;ipc&quot;/&quot;arrow&quot;/&quot;feather&quot;, &quot;csv&quot;, &quot;json&quot;, and &quot;orc&quot; are</span>
<span class="sd"> supported. For Feather, only version 2 files are supported.</span>
<span class="sd"> filesystem : FileSystem or URI string, default None</span>
<span class="sd"> If a single path is given as source and filesystem is None, then the</span>
<span class="sd"> filesystem will be inferred from the path.</span>
<span class="sd"> If a URI string is passed, then a filesystem object is constructed</span>
<span class="sd"> using the URI&#39;s optional path component as a directory prefix. See the</span>
<span class="sd"> examples below.</span>
<span class="sd"> Note that the URIs on Windows must follow &#39;file:///C:...&#39; or</span>
<span class="sd"> &#39;file:/C:...&#39; patterns.</span>
<span class="sd"> partitioning : Partitioning, PartitioningFactory, str, list of str</span>
<span class="sd"> The partitioning scheme specified with the ``partitioning()``</span>
<span class="sd"> function. A flavor string can be used as shortcut, and with a list of</span>
<span class="sd"> field names a DirectoryPartitioning will be inferred.</span>
<span class="sd"> partition_base_dir : str, optional</span>
<span class="sd"> For the purposes of applying the partitioning, paths will be</span>
<span class="sd"> stripped of the partition_base_dir. Files not matching the</span>
<span class="sd"> partition_base_dir prefix will be skipped for partitioning discovery.</span>
<span class="sd"> The ignored files will still be part of the Dataset, but will not</span>
<span class="sd"> have partition information.</span>
<span class="sd"> exclude_invalid_files : bool, optional (default True)</span>
<span class="sd"> If True, invalid files will be excluded (file format specific check).</span>
<span class="sd"> This will incur IO for each file in a serial and single threaded</span>
<span class="sd"> fashion. Disabling this feature will skip the IO, but unsupported</span>
<span class="sd"> files may be present in the Dataset (resulting in an error at scan</span>
<span class="sd"> time).</span>
<span class="sd"> ignore_prefixes : list, optional</span>
<span class="sd"> Files matching any of these prefixes will be ignored by the</span>
<span class="sd"> discovery process. This is matched to the basename of a path.</span>
<span class="sd"> By default this is [&#39;.&#39;, &#39;_&#39;].</span>
<span class="sd"> Note that discovery happens only if a directory is passed as source.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> dataset : Dataset</span>
<span class="sd"> A FileSystemDataset, UnionDataset or InMemoryDataset depending on the</span>
<span class="sd"> source parameter.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Creating an example Table:</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow as pa</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="sd"> &gt;&gt;&gt; table = pa.table({&#39;year&#39;: [2020, 2022, 2021, 2022, 2019, 2021],</span>
<span class="sd"> ... &#39;n_legs&#39;: [2, 2, 4, 4, 5, 100],</span>
<span class="sd"> ... &#39;animal&#39;: [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="sd"> ... &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; pq.write_table(table, &quot;file.parquet&quot;)</span>
<span class="sd"> Opening a single file:</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow.dataset as ds</span>
<span class="sd"> &gt;&gt;&gt; dataset = ds.dataset(&quot;file.parquet&quot;, format=&quot;parquet&quot;)</span>
<span class="sd"> &gt;&gt;&gt; dataset.to_table()</span>
<span class="sd"> pyarrow.Table</span>
<span class="sd"> year: int64</span>
<span class="sd"> n_legs: int64</span>
<span class="sd"> animal: string</span>
<span class="sd"> ----</span>
<span class="sd"> year: [[2020,2022,2021,2022,2019,2021]]</span>
<span class="sd"> n_legs: [[2,2,4,4,5,100]]</span>
<span class="sd"> animal: [[&quot;Flamingo&quot;,&quot;Parrot&quot;,&quot;Dog&quot;,&quot;Horse&quot;,&quot;Brittle stars&quot;,&quot;Centipede&quot;]]</span>
<span class="sd"> Opening a single file with an explicit schema:</span>
<span class="sd"> &gt;&gt;&gt; myschema = pa.schema([</span>
<span class="sd"> ... (&#39;n_legs&#39;, pa.int64()),</span>
<span class="sd"> ... (&#39;animal&#39;, pa.string())])</span>
<span class="sd"> &gt;&gt;&gt; dataset = ds.dataset(&quot;file.parquet&quot;, schema=myschema, format=&quot;parquet&quot;)</span>
<span class="sd"> &gt;&gt;&gt; dataset.to_table()</span>
<span class="sd"> pyarrow.Table</span>
<span class="sd"> n_legs: int64</span>
<span class="sd"> animal: string</span>
<span class="sd"> ----</span>
<span class="sd"> n_legs: [[2,2,4,4,5,100]]</span>
<span class="sd"> animal: [[&quot;Flamingo&quot;,&quot;Parrot&quot;,&quot;Dog&quot;,&quot;Horse&quot;,&quot;Brittle stars&quot;,&quot;Centipede&quot;]]</span>
<span class="sd"> Opening a dataset for a single directory:</span>
<span class="sd"> &gt;&gt;&gt; ds.write_dataset(table, &quot;partitioned_dataset&quot;, format=&quot;parquet&quot;,</span>
<span class="sd"> ... partitioning=[&#39;year&#39;])</span>
<span class="sd"> &gt;&gt;&gt; dataset = ds.dataset(&quot;partitioned_dataset&quot;, format=&quot;parquet&quot;)</span>
<span class="sd"> &gt;&gt;&gt; dataset.to_table()</span>
<span class="sd"> pyarrow.Table</span>
<span class="sd"> n_legs: int64</span>
<span class="sd"> animal: string</span>
<span class="sd"> ----</span>
<span class="sd"> n_legs: [[5],[2],[4,100],[2,4]]</span>
<span class="sd"> animal: [[&quot;Brittle stars&quot;],[&quot;Flamingo&quot;],...[&quot;Parrot&quot;,&quot;Horse&quot;]]</span>
<span class="sd"> For a single directory from an S3 bucket:</span>
<span class="sd"> &gt;&gt;&gt; ds.dataset(&quot;s3://mybucket/nyc-taxi/&quot;,</span>
<span class="sd"> ... format=&quot;parquet&quot;) # doctest: +SKIP</span>
<span class="sd"> Opening a dataset from a list of relative local paths:</span>
<span class="sd"> &gt;&gt;&gt; dataset = ds.dataset([</span>
<span class="sd"> ... &quot;partitioned_dataset/2019/part-0.parquet&quot;,</span>
<span class="sd"> ... &quot;partitioned_dataset/2020/part-0.parquet&quot;,</span>
<span class="sd"> ... &quot;partitioned_dataset/2021/part-0.parquet&quot;,</span>
<span class="sd"> ... ], format=&#39;parquet&#39;)</span>
<span class="sd"> &gt;&gt;&gt; dataset.to_table()</span>
<span class="sd"> pyarrow.Table</span>
<span class="sd"> n_legs: int64</span>
<span class="sd"> animal: string</span>
<span class="sd"> ----</span>
<span class="sd"> n_legs: [[5],[2],[4,100]]</span>
<span class="sd"> animal: [[&quot;Brittle stars&quot;],[&quot;Flamingo&quot;],[&quot;Dog&quot;,&quot;Centipede&quot;]]</span>
<span class="sd"> With filesystem provided:</span>
<span class="sd"> &gt;&gt;&gt; paths = [</span>
<span class="sd"> ... &#39;part0/data.parquet&#39;,</span>
<span class="sd"> ... &#39;part1/data.parquet&#39;,</span>
<span class="sd"> ... &#39;part3/data.parquet&#39;,</span>
<span class="sd"> ... ]</span>
<span class="sd"> &gt;&gt;&gt; ds.dataset(paths, filesystem=&#39;file:///directory/prefix&#39;,</span>
<span class="sd"> ... format=&#39;parquet&#39;) # doctest: +SKIP</span>
<span class="sd"> Which is equivalent to:</span>
<span class="sd"> &gt;&gt;&gt; fs = SubTreeFileSystem(&quot;/directory/prefix&quot;,</span>
<span class="sd"> ... LocalFileSystem()) # doctest: +SKIP</span>
<span class="sd"> &gt;&gt;&gt; ds.dataset(paths, filesystem=fs, format=&#39;parquet&#39;) # doctest: +SKIP</span>
<span class="sd"> With a remote filesystem URI:</span>
<span class="sd"> &gt;&gt;&gt; paths = [</span>
<span class="sd"> ... &#39;nested/directory/part0/data.parquet&#39;,</span>
<span class="sd"> ... &#39;nested/directory/part1/data.parquet&#39;,</span>
<span class="sd"> ... &#39;nested/directory/part3/data.parquet&#39;,</span>
<span class="sd"> ... ]</span>
<span class="sd"> &gt;&gt;&gt; ds.dataset(paths, filesystem=&#39;s3://bucket/&#39;,</span>
<span class="sd"> ... format=&#39;parquet&#39;) # doctest: +SKIP</span>
<span class="sd"> Similarly to the local example, the directory prefix may be included in the</span>
<span class="sd"> filesystem URI:</span>
<span class="sd"> &gt;&gt;&gt; ds.dataset(paths, filesystem=&#39;s3://bucket/nested/directory&#39;,</span>
<span class="sd"> ... format=&#39;parquet&#39;) # doctest: +SKIP</span>
<span class="sd"> Construction of a nested dataset:</span>
<span class="sd"> &gt;&gt;&gt; ds.dataset([</span>
<span class="sd"> ... ds.dataset(&quot;s3://old-taxi-data&quot;, format=&quot;parquet&quot;),</span>
<span class="sd"> ... ds.dataset(&quot;local/path/to/data&quot;, format=&quot;ipc&quot;)</span>
<span class="sd"> ... ]) # doctest: +SKIP</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyarrow.fs</span> <span class="kn">import</span> <span class="n">FileInfo</span>
<span class="c1"># collect the keyword arguments for later reuse</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span>
<span class="n">schema</span><span class="o">=</span><span class="n">schema</span><span class="p">,</span>
<span class="n">filesystem</span><span class="o">=</span><span class="n">filesystem</span><span class="p">,</span>
<span class="n">partitioning</span><span class="o">=</span><span class="n">partitioning</span><span class="p">,</span>
<span class="nb">format</span><span class="o">=</span><span class="nb">format</span><span class="p">,</span>
<span class="n">partition_base_dir</span><span class="o">=</span><span class="n">partition_base_dir</span><span class="p">,</span>
<span class="n">exclude_invalid_files</span><span class="o">=</span><span class="n">exclude_invalid_files</span><span class="p">,</span>
<span class="n">selector_ignore_prefixes</span><span class="o">=</span><span class="n">ignore_prefixes</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">_is_path_like</span><span class="p">(</span><span class="n">source</span><span class="p">):</span>
<span class="k">return</span> <span class="n">_filesystem_dataset</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="p">(</span><span class="nb">tuple</span><span class="p">,</span> <span class="nb">list</span><span class="p">)):</span>
<span class="k">if</span> <span class="nb">all</span><span class="p">(</span><span class="n">_is_path_like</span><span class="p">(</span><span class="n">elem</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">elem</span><span class="p">,</span> <span class="n">FileInfo</span><span class="p">)</span> <span class="k">for</span> <span class="n">elem</span> <span class="ow">in</span> <span class="n">source</span><span class="p">):</span>
<span class="k">return</span> <span class="n">_filesystem_dataset</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">elem</span><span class="p">,</span> <span class="n">Dataset</span><span class="p">)</span> <span class="k">for</span> <span class="n">elem</span> <span class="ow">in</span> <span class="n">source</span><span class="p">):</span>
<span class="k">return</span> <span class="n">_union_dataset</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">elem</span><span class="p">,</span> <span class="p">(</span><span class="n">pa</span><span class="o">.</span><span class="n">RecordBatch</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">Table</span><span class="p">))</span>
<span class="k">for</span> <span class="n">elem</span> <span class="ow">in</span> <span class="n">source</span><span class="p">):</span>
<span class="k">return</span> <span class="n">_in_memory_dataset</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">unique_types</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">elem</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span> <span class="k">for</span> <span class="n">elem</span> <span class="ow">in</span> <span class="n">source</span><span class="p">)</span>
<span class="n">type_names</span> <span class="o">=</span> <span class="s1">&#39;, &#39;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="s1">&#39;</span><span class="si">{}</span><span class="s1">&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">t</span><span class="p">)</span> <span class="k">for</span> <span class="n">t</span> <span class="ow">in</span> <span class="n">unique_types</span><span class="p">)</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s1">&#39;Expected a list of path-like or dataset objects, or a list &#39;</span>
<span class="s1">&#39;of batches or tables. The given list contains the following &#39;</span>
<span class="s1">&#39;types: </span><span class="si">{}</span><span class="s1">&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">type_names</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="p">(</span><span class="n">pa</span><span class="o">.</span><span class="n">RecordBatch</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">Table</span><span class="p">)):</span>
<span class="k">return</span> <span class="n">_in_memory_dataset</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s1">&#39;Expected a path-like, list of path-likes or a list of Datasets &#39;</span>
<span class="s1">&#39;instead of the given type: </span><span class="si">{}</span><span class="s1">&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">source</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_ensure_write_partitioning</span><span class="p">(</span><span class="n">part</span><span class="p">,</span> <span class="n">schema</span><span class="p">,</span> <span class="n">flavor</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">part</span><span class="p">,</span> <span class="n">PartitioningFactory</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;A PartitioningFactory cannot be used. &quot;</span>
<span class="s2">&quot;Did you call the partitioning function &quot;</span>
<span class="s2">&quot;without supplying a schema?&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">part</span><span class="p">,</span> <span class="n">Partitioning</span><span class="p">)</span> <span class="ow">and</span> <span class="n">flavor</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Providing a partitioning_flavor with &quot;</span>
<span class="s2">&quot;a Partitioning object is not supported&quot;</span>
<span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">part</span><span class="p">,</span> <span class="p">(</span><span class="nb">tuple</span><span class="p">,</span> <span class="nb">list</span><span class="p">)):</span>
<span class="c1"># Names of fields were provided instead of a partitioning object.</span>
<span class="c1"># Create a partitioning factory with those field names.</span>
<span class="n">part</span> <span class="o">=</span> <span class="n">partitioning</span><span class="p">(</span>
<span class="n">schema</span><span class="o">=</span><span class="n">pa</span><span class="o">.</span><span class="n">schema</span><span class="p">([</span><span class="n">schema</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="n">f</span><span class="p">)</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">part</span><span class="p">]),</span>
<span class="n">flavor</span><span class="o">=</span><span class="n">flavor</span>
<span class="p">)</span>
<span class="k">elif</span> <span class="n">part</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">part</span> <span class="o">=</span> <span class="n">partitioning</span><span class="p">(</span><span class="n">pa</span><span class="o">.</span><span class="n">schema</span><span class="p">([]),</span> <span class="n">flavor</span><span class="o">=</span><span class="n">flavor</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">part</span><span class="p">,</span> <span class="n">Partitioning</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;partitioning must be a Partitioning object or &quot;</span>
<span class="s2">&quot;a list of column names&quot;</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">part</span>
<div class="viewcode-block" id="write_dataset"><a class="viewcode-back" href="../../python/generated/pyarrow.dataset.write_dataset.html#pyarrow.dataset.write_dataset">[docs]</a><span class="k">def</span> <span class="nf">write_dataset</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">base_dir</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">basename_template</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">partitioning</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">partitioning_flavor</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">filesystem</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">file_options</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">use_threads</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">max_partitions</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">max_open_files</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">max_rows_per_file</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">min_rows_per_group</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">max_rows_per_group</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">file_visitor</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">existing_data_behavior</span><span class="o">=</span><span class="s1">&#39;error&#39;</span><span class="p">,</span> <span class="n">create_dir</span><span class="o">=</span><span class="kc">True</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Write a dataset to a given format and partitioning.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> data : Dataset, Table/RecordBatch, RecordBatchReader, list of \</span>
<span class="sd">Table/RecordBatch, or iterable of RecordBatch</span>
<span class="sd"> The data to write. This can be a Dataset instance or</span>
<span class="sd"> in-memory Arrow data. If an iterable is given, the schema must</span>
<span class="sd"> also be given.</span>
<span class="sd"> base_dir : str</span>
<span class="sd"> The root directory in which to write the dataset.</span>
<span class="sd"> basename_template : str, optional</span>
<span class="sd"> A template string used to generate basenames of written data files.</span>
<span class="sd"> The token &#39;{i}&#39; will be replaced with an automatically incremented</span>
<span class="sd"> integer. If not specified, it defaults to</span>
<span class="sd"> &quot;part-{i}.&quot; + format.default_extname</span>
<span class="sd"> format : FileFormat or str</span>
<span class="sd"> The format in which to write the dataset. Currently supported:</span>
<span class="sd"> &quot;parquet&quot;, &quot;ipc&quot;/&quot;arrow&quot;/&quot;feather&quot;, and &quot;csv&quot;. If a FileSystemDataset</span>
<span class="sd"> is being written and `format` is not specified, it defaults to the</span>
<span class="sd"> same format as the specified FileSystemDataset. When writing a</span>
<span class="sd"> Table or RecordBatch, this keyword is required.</span>
<span class="sd"> partitioning : Partitioning or list[str], optional</span>
<span class="sd"> The partitioning scheme specified with the ``partitioning()``</span>
<span class="sd"> function or a list of field names. When providing a list of</span>
<span class="sd"> field names, you can use ``partitioning_flavor`` to drive which</span>
<span class="sd"> partitioning type should be used.</span>
<span class="sd"> partitioning_flavor : str, optional</span>
<span class="sd"> One of the partitioning flavors supported by</span>
<span class="sd"> ``pyarrow.dataset.partitioning``. If omitted will use the</span>
<span class="sd"> default of ``partitioning()`` which is directory partitioning.</span>
<span class="sd"> schema : Schema, optional</span>
<span class="sd"> filesystem : FileSystem, optional</span>
<span class="sd"> file_options : pyarrow.dataset.FileWriteOptions, optional</span>
<span class="sd"> FileFormat specific write options, created using the</span>
<span class="sd"> ``FileFormat.make_write_options()`` function.</span>
<span class="sd"> use_threads : bool, default True</span>
<span class="sd"> Write files in parallel. If enabled, the maximum parallelism will be</span>
<span class="sd"> used, as determined by the number of available CPU cores.</span>
<span class="sd"> max_partitions : int, default 1024</span>
<span class="sd"> Maximum number of partitions any batch may be written into.</span>
<span class="sd"> max_open_files : int, default 1024</span>
<span class="sd"> If greater than 0 then this will limit the maximum number of</span>
<span class="sd"> files that can be left open. If an attempt is made to open</span>
<span class="sd"> too many files then the least recently used file will be closed.</span>
<span class="sd"> If this setting is set too low you may end up fragmenting your</span>
<span class="sd"> data into many small files.</span>
<span class="sd"> max_rows_per_file : int, default 0</span>
<span class="sd"> Maximum number of rows per file. If greater than 0 then this will</span>
<span class="sd"> limit how many rows are placed in any single file. Otherwise there</span>
<span class="sd"> will be no limit and one file will be created in each output</span>
<span class="sd"> directory unless files need to be closed to respect max_open_files.</span>
<span class="sd"> min_rows_per_group : int, default 0</span>
<span class="sd"> Minimum number of rows per group. When the value is greater than 0,</span>
<span class="sd"> the dataset writer will batch incoming data and only write the row</span>
<span class="sd"> groups to the disk when sufficient rows have accumulated.</span>
<span class="sd"> max_rows_per_group : int, default 1024 * 1024</span>
<span class="sd"> Maximum number of rows per group. If the value is greater than 0,</span>
<span class="sd"> then the dataset writer may split up large incoming batches into</span>
<span class="sd"> multiple row groups. If this value is set, then min_rows_per_group</span>
<span class="sd"> should also be set. Otherwise it could end up with very small row</span>
<span class="sd"> groups.</span>
<span class="sd"> file_visitor : function</span>
<span class="sd"> If set, this function will be called with a WrittenFile instance</span>
<span class="sd"> for each file created during the call. This object will have both</span>
<span class="sd"> a path attribute and a metadata attribute.</span>
<span class="sd"> The path attribute will be a string containing the path to</span>
<span class="sd"> the created file.</span>
<span class="sd"> The metadata attribute will be the parquet metadata of the file.</span>
<span class="sd"> This metadata will have the file path attribute set and can be used</span>
<span class="sd"> to build a _metadata file. The metadata attribute will be None if</span>
<span class="sd"> the format is not parquet.</span>
<span class="sd"> Example visitor which simply collects the filenames created::</span>
<span class="sd"> visited_paths = []</span>
<span class="sd"> def file_visitor(written_file):</span>
<span class="sd"> visited_paths.append(written_file.path)</span>
<span class="sd"> existing_data_behavior : &#39;error&#39; | &#39;overwrite_or_ignore&#39; | \</span>
<span class="sd">&#39;delete_matching&#39;</span>
<span class="sd"> Controls how the dataset will handle data that already exists in</span>
<span class="sd"> the destination. The default behavior (&#39;error&#39;) is to raise an error</span>
<span class="sd"> if any data exists in the destination.</span>
<span class="sd"> &#39;overwrite_or_ignore&#39; will ignore any existing data and will</span>
<span class="sd"> overwrite files with the same name as an output file. Other</span>
<span class="sd"> existing files will be ignored. This behavior, in combination</span>
<span class="sd"> with a unique basename_template for each write, will allow for</span>
<span class="sd"> an append workflow.</span>
<span class="sd"> &#39;delete_matching&#39; is useful when you are writing a partitioned</span>
<span class="sd"> dataset. The first time each partition directory is encountered</span>
<span class="sd"> the entire directory will be deleted. This allows you to overwrite</span>
<span class="sd"> old partitions completely.</span>
<span class="sd"> create_dir : bool, default True</span>
<span class="sd"> If False, directories will not be created. This can be useful for</span>
<span class="sd"> filesystems that do not require directories.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyarrow.fs</span> <span class="kn">import</span> <span class="n">_resolve_filesystem_and_path</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">schema</span> <span class="ow">or</span> <span class="n">data</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">schema</span>
<span class="n">data</span> <span class="o">=</span> <span class="n">InMemoryDataset</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="n">schema</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="p">(</span><span class="n">pa</span><span class="o">.</span><span class="n">RecordBatch</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">Table</span><span class="p">)):</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">schema</span> <span class="ow">or</span> <span class="n">data</span><span class="o">.</span><span class="n">schema</span>
<span class="n">data</span> <span class="o">=</span> <span class="n">InMemoryDataset</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="n">schema</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">ipc</span><span class="o">.</span><span class="n">RecordBatchReader</span><span class="p">)</span> <span class="ow">or</span> <span class="n">_is_iterable</span><span class="p">(</span><span class="n">data</span><span class="p">):</span>
<span class="n">data</span> <span class="o">=</span> <span class="n">Scanner</span><span class="o">.</span><span class="n">from_batches</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="n">schema</span><span class="p">)</span>
<span class="n">schema</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="p">(</span><span class="n">Dataset</span><span class="p">,</span> <span class="n">Scanner</span><span class="p">)):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Only Dataset, Scanner, Table/RecordBatch, RecordBatchReader, &quot;</span>
<span class="s2">&quot;a list of Tables/RecordBatches, or iterable of batches are &quot;</span>
<span class="s2">&quot;supported.&quot;</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">format</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">FileSystemDataset</span><span class="p">):</span>
<span class="nb">format</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">format</span>
<span class="k">else</span><span class="p">:</span>
<span class="nb">format</span> <span class="o">=</span> <span class="n">_ensure_format</span><span class="p">(</span><span class="nb">format</span><span class="p">)</span>
<span class="k">if</span> <span class="n">file_options</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">file_options</span> <span class="o">=</span> <span class="nb">format</span><span class="o">.</span><span class="n">make_write_options</span><span class="p">()</span>
<span class="k">if</span> <span class="nb">format</span> <span class="o">!=</span> <span class="n">file_options</span><span class="o">.</span><span class="n">format</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;Supplied FileWriteOptions have format </span><span class="si">{}</span><span class="s2">, &quot;</span>
<span class="s2">&quot;which doesn&#39;t match supplied FileFormat </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="nb">format</span><span class="p">,</span> <span class="n">file_options</span><span class="p">))</span>
<span class="k">if</span> <span class="n">basename_template</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">basename_template</span> <span class="o">=</span> <span class="s2">&quot;part-</span><span class="si">{i}</span><span class="s2">.&quot;</span> <span class="o">+</span> <span class="nb">format</span><span class="o">.</span><span class="n">default_extname</span>
<span class="k">if</span> <span class="n">max_partitions</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">max_partitions</span> <span class="o">=</span> <span class="mi">1024</span>
<span class="k">if</span> <span class="n">max_open_files</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">max_open_files</span> <span class="o">=</span> <span class="mi">1024</span>
<span class="k">if</span> <span class="n">max_rows_per_file</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">max_rows_per_file</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">if</span> <span class="n">max_rows_per_group</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">max_rows_per_group</span> <span class="o">=</span> <span class="mi">1</span> <span class="o">&lt;&lt;</span> <span class="mi">20</span>
<span class="k">if</span> <span class="n">min_rows_per_group</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">min_rows_per_group</span> <span class="o">=</span> <span class="mi">0</span>
<span class="c1"># at this point data is a Scanner or a Dataset, anything else</span>
<span class="c1"># was converted to one of those two. So we can grab the schema</span>
<span class="c1"># to build the partitioning object from Dataset.</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">Scanner</span><span class="p">):</span>
<span class="n">partitioning_schema</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">projected_schema</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">partitioning_schema</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">schema</span>
<span class="n">partitioning</span> <span class="o">=</span> <span class="n">_ensure_write_partitioning</span><span class="p">(</span><span class="n">partitioning</span><span class="p">,</span>
<span class="n">schema</span><span class="o">=</span><span class="n">partitioning_schema</span><span class="p">,</span>
<span class="n">flavor</span><span class="o">=</span><span class="n">partitioning_flavor</span><span class="p">)</span>
<span class="n">filesystem</span><span class="p">,</span> <span class="n">base_dir</span> <span class="o">=</span> <span class="n">_resolve_filesystem_and_path</span><span class="p">(</span><span class="n">base_dir</span><span class="p">,</span> <span class="n">filesystem</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">Dataset</span><span class="p">):</span>
<span class="n">scanner</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">scanner</span><span class="p">(</span><span class="n">use_threads</span><span class="o">=</span><span class="n">use_threads</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># scanner was passed directly by the user, in which case a schema</span>
<span class="c1"># cannot be passed</span>
<span class="k">if</span> <span class="n">schema</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Cannot specify a schema when writing a Scanner&quot;</span><span class="p">)</span>
<span class="n">scanner</span> <span class="o">=</span> <span class="n">data</span>
<span class="n">_filesystemdataset_write</span><span class="p">(</span>
<span class="n">scanner</span><span class="p">,</span> <span class="n">base_dir</span><span class="p">,</span> <span class="n">basename_template</span><span class="p">,</span> <span class="n">filesystem</span><span class="p">,</span> <span class="n">partitioning</span><span class="p">,</span>
<span class="n">file_options</span><span class="p">,</span> <span class="n">max_partitions</span><span class="p">,</span> <span class="n">file_visitor</span><span class="p">,</span> <span class="n">existing_data_behavior</span><span class="p">,</span>
<span class="n">max_open_files</span><span class="p">,</span> <span class="n">max_rows_per_file</span><span class="p">,</span>
<span class="n">min_rows_per_group</span><span class="p">,</span> <span class="n">max_rows_per_group</span><span class="p">,</span> <span class="n">create_dir</span>
<span class="p">)</span></div>
</pre></div>
</article>
<footer class="prev-next-footer">
<div class="prev-next-area">
</div>
</footer>
</div>
<div class="bd-sidebar-secondary bd-toc"></div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../../_static/scripts/bootstrap.js?digest=8d27b9dea8ad943066ae"></script>
<script src="../../_static/scripts/pydata-sphinx-theme.js?digest=8d27b9dea8ad943066ae"></script>
<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
<div class="footer-items__start">
<div class="footer-item">
<p class="copyright">
© Copyright 2016-2024 Apache Software Foundation.
Apache Arrow, Arrow, Apache, the Apache feather logo, and the Apache Arrow project logo are either registered trademarks or trademarks of The Apache Software Foundation in the United States and other countries.
<br/>
</p>
</div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 6.2.0.
<br/>
</p>
</div>
</div>
<div class="footer-items__end">
<div class="footer-item">
<p class="theme-version">
Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.15.2.
</p></div>
</div>
</div>
</footer>
</body>
</html>