Source code for pyarrow.parquet.core

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

<span class="kn">from</span> <span class="nn">collections</span> <span class="kn">import</span> <span class="n">defaultdict</span>
<span class="kn">from</span> <span class="nn">contextlib</span> <span class="kn">import</span> <span class="n">nullcontext</span>
<span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">reduce</span>
<span class="kn">import</span> <span class="nn">inspect</span>
<span class="kn">import</span> <span class="nn">json</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">re</span>
<span class="kn">import</span> <span class="nn">operator</span>
<span class="kn">import</span> <span class="nn">warnings</span>
<span class="kn">import</span> <span class="nn">pyarrow</span> <span class="k">as</span> <span class="nn">pa</span>
<span class="k">try</span><span class="p">:</span>
<span class="kn">import</span> <span class="nn">pyarrow._parquet</span> <span class="k">as</span> <span class="nn">_parquet</span>
<span class="k">except</span> <span class="ne">ImportError</span> <span class="k">as</span> <span class="n">exc</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ImportError</span><span class="p">(</span>
<span class="s2">&quot;The pyarrow installation is not built with support &quot;</span>
<span class="sa">f</span><span class="s2">&quot;for the Parquet file format (</span><span class="si">{</span><span class="nb">str</span><span class="p">(</span><span class="n">exc</span><span class="p">)</span><span class="si">}</span><span class="s2">)&quot;</span>
<span class="p">)</span> <span class="kn">from</span> <span class="kc">None</span>
<span class="kn">from</span> <span class="nn">pyarrow._parquet</span> <span class="kn">import</span> <span class="p">(</span><span class="n">ParquetReader</span><span class="p">,</span> <span class="n">Statistics</span><span class="p">,</span> <span class="c1"># noqa</span>
<span class="n">FileMetaData</span><span class="p">,</span> <span class="n">RowGroupMetaData</span><span class="p">,</span>
<span class="n">ColumnChunkMetaData</span><span class="p">,</span>
<span class="n">ParquetSchema</span><span class="p">,</span> <span class="n">ColumnSchema</span><span class="p">,</span>
<span class="n">ParquetLogicalType</span><span class="p">,</span>
<span class="n">FileEncryptionProperties</span><span class="p">,</span>
<span class="n">FileDecryptionProperties</span><span class="p">,</span>
<span class="n">SortingColumn</span><span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyarrow.fs</span> <span class="kn">import</span> <span class="p">(</span><span class="n">LocalFileSystem</span><span class="p">,</span> <span class="n">FileSystem</span><span class="p">,</span> <span class="n">FileType</span><span class="p">,</span>
<span class="n">_resolve_filesystem_and_path</span><span class="p">,</span> <span class="n">_ensure_filesystem</span><span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyarrow.util</span> <span class="kn">import</span> <span class="n">guid</span><span class="p">,</span> <span class="n">_is_path_like</span><span class="p">,</span> <span class="n">_stringify_path</span><span class="p">,</span> <span class="n">_deprecate_api</span>
<span class="k">def</span> <span class="nf">_check_contains_null</span><span class="p">(</span><span class="n">val</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">val</span><span class="p">,</span> <span class="nb">bytes</span><span class="p">):</span>
<span class="k">for</span> <span class="n">byte</span> <span class="ow">in</span> <span class="n">val</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">byte</span><span class="p">,</span> <span class="nb">bytes</span><span class="p">):</span>
<span class="n">compare_to</span> <span class="o">=</span> <span class="nb">chr</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">compare_to</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">if</span> <span class="n">byte</span> <span class="o">==</span> <span class="n">compare_to</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">True</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">val</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="k">return</span> <span class="s1">&#39;</span><span class="se">\x00</span><span class="s1">&#39;</span> <span class="ow">in</span> <span class="n">val</span>
<span class="k">return</span> <span class="kc">False</span>
<span class="k">def</span> <span class="nf">_check_filters</span><span class="p">(</span><span class="n">filters</span><span class="p">,</span> <span class="n">check_null_strings</span><span class="o">=</span><span class="kc">True</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Check if filters are well-formed.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">filters</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">filters</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span> <span class="ow">or</span> <span class="nb">any</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">f</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">filters</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Malformed filters&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">filters</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">],</span> <span class="nb">str</span><span class="p">):</span>
<span class="c1"># We have encountered the situation where we have one nesting level</span>
<span class="c1"># too few:</span>
<span class="c1"># We have [(,,), ..] instead of [[(,,), ..]]</span>
<span class="n">filters</span> <span class="o">=</span> <span class="p">[</span><span class="n">filters</span><span class="p">]</span>
<span class="k">if</span> <span class="n">check_null_strings</span><span class="p">:</span>
<span class="k">for</span> <span class="n">conjunction</span> <span class="ow">in</span> <span class="n">filters</span><span class="p">:</span>
<span class="k">for</span> <span class="n">col</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="n">val</span> <span class="ow">in</span> <span class="n">conjunction</span><span class="p">:</span>
<span class="k">if</span> <span class="p">(</span>
<span class="nb">isinstance</span><span class="p">(</span><span class="n">val</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span> <span class="ow">and</span>
<span class="nb">all</span><span class="p">(</span><span class="n">_check_contains_null</span><span class="p">(</span><span class="n">v</span><span class="p">)</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">val</span><span class="p">)</span> <span class="ow">or</span>
<span class="n">_check_contains_null</span><span class="p">(</span><span class="n">val</span><span class="p">)</span>
<span class="p">):</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span>
<span class="s2">&quot;Null-terminated binary strings are not supported &quot;</span>
<span class="s2">&quot;as filter values.&quot;</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">filters</span>
<span class="n">_DNF_filter_doc</span> <span class="o">=</span> <span class="s2">&quot;&quot;&quot;Predicates are expressed using an ``Expression`` or using</span>
<span class="s2"> the disjunctive normal form (DNF), like ``[[(&#39;x&#39;, &#39;=&#39;, 0), ...], ...]``.</span>
<span class="s2"> DNF allows arbitrary boolean logical combinations of single column predicates.</span>
<span class="s2"> The innermost tuples each describe a single column predicate. The list of inner</span>
<span class="s2"> predicates is interpreted as a conjunction (AND), forming a more selective and</span>
<span class="s2"> multiple column predicate. Finally, the most outer list combines these filters</span>
<span class="s2"> as a disjunction (OR).</span>
<span class="s2"> Predicates may also be passed as List[Tuple]. This form is interpreted</span>
<span class="s2"> as a single conjunction. To express OR in predicates, one must</span>
<span class="s2"> use the (preferred) List[List[Tuple]] notation.</span>
<span class="s2"> Each tuple has format: (``key``, ``op``, ``value``) and compares the</span>
<span class="s2"> ``key`` with the ``value``.</span>
<span class="s2"> The supported ``op`` are: ``=`` or ``==``, ``!=``, ``&lt;``, ``&gt;``, ``&lt;=``,</span>
<span class="s2"> ``&gt;=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the</span>
<span class="s2"> ``value`` must be a collection such as a ``list``, a ``set`` or a</span>
<span class="s2"> ``tuple``.</span>
<span class="s2"> Examples:</span>
<span class="s2"> Using the ``Expression`` API:</span>
<span class="s2"> .. code-block:: python</span>
<span class="s2"> import pyarrow.compute as pc</span>
<span class="s2"> pc.field(&#39;x&#39;) = 0</span>
<span class="s2"> pc.field(&#39;y&#39;).isin([&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="s2"> ~pc.field(&#39;y&#39;).isin({&#39;a&#39;, &#39;b&#39;})</span>
<span class="s2"> Using the DNF format:</span>
<span class="s2"> .. code-block:: python</span>
<span class="s2"> (&#39;x&#39;, &#39;=&#39;, 0)</span>
<span class="s2"> (&#39;y&#39;, &#39;in&#39;, [&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="s2"> (&#39;z&#39;, &#39;not in&#39;, {&#39;a&#39;,&#39;b&#39;})</span>
<span class="s2"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">filters_to_expression</span><span class="p">(</span><span class="n">filters</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Check if filters are well-formed and convert to an ``Expression``.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> filters : List[Tuple] or List[List[Tuple]]</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> See internal ``pyarrow._DNF_filter_doc`` attribute for more details.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; filters_to_expression([(&#39;foo&#39;, &#39;==&#39;, &#39;bar&#39;)])</span>
<span class="sd"> &lt;pyarrow.compute.Expression (foo == &quot;bar&quot;)&gt;</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> pyarrow.compute.Expression</span>
<span class="sd"> An Expression representing the filters</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">import</span> <span class="nn">pyarrow.dataset</span> <span class="k">as</span> <span class="nn">ds</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">filters</span><span class="p">,</span> <span class="n">ds</span><span class="o">.</span><span class="n">Expression</span><span class="p">):</span>
<span class="k">return</span> <span class="n">filters</span>
<span class="n">filters</span> <span class="o">=</span> <span class="n">_check_filters</span><span class="p">(</span><span class="n">filters</span><span class="p">,</span> <span class="n">check_null_strings</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">convert_single_predicate</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="n">val</span><span class="p">):</span>
<span class="n">field</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="n">col</span><span class="p">)</span>
<span class="k">if</span> <span class="n">op</span> <span class="o">==</span> <span class="s2">&quot;=&quot;</span> <span class="ow">or</span> <span class="n">op</span> <span class="o">==</span> <span class="s2">&quot;==&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">field</span> <span class="o">==</span> <span class="n">val</span>
<span class="k">elif</span> <span class="n">op</span> <span class="o">==</span> <span class="s2">&quot;!=&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">field</span> <span class="o">!=</span> <span class="n">val</span>
<span class="k">elif</span> <span class="n">op</span> <span class="o">==</span> <span class="s1">&#39;&lt;&#39;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">field</span> <span class="o">&lt;</span> <span class="n">val</span>
<span class="k">elif</span> <span class="n">op</span> <span class="o">==</span> <span class="s1">&#39;&gt;&#39;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">field</span> <span class="o">&gt;</span> <span class="n">val</span>
<span class="k">elif</span> <span class="n">op</span> <span class="o">==</span> <span class="s1">&#39;&lt;=&#39;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">field</span> <span class="o">&lt;=</span> <span class="n">val</span>
<span class="k">elif</span> <span class="n">op</span> <span class="o">==</span> <span class="s1">&#39;&gt;=&#39;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">field</span> <span class="o">&gt;=</span> <span class="n">val</span>
<span class="k">elif</span> <span class="n">op</span> <span class="o">==</span> <span class="s1">&#39;in&#39;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">field</span><span class="o">.</span><span class="n">isin</span><span class="p">(</span><span class="n">val</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">op</span> <span class="o">==</span> <span class="s1">&#39;not in&#39;</span><span class="p">:</span>
<span class="k">return</span> <span class="o">~</span><span class="n">field</span><span class="o">.</span><span class="n">isin</span><span class="p">(</span><span class="n">val</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s1">&#39;&quot;</span><span class="si">{0}</span><span class="s1">&quot; is not a valid operator in predicates.&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="n">val</span><span class="p">)))</span>
<span class="n">disjunction_members</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">conjunction</span> <span class="ow">in</span> <span class="n">filters</span><span class="p">:</span>
<span class="n">conjunction_members</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">convert_single_predicate</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="n">val</span><span class="p">)</span>
<span class="k">for</span> <span class="n">col</span><span class="p">,</span> <span class="n">op</span><span class="p">,</span> <span class="n">val</span> <span class="ow">in</span> <span class="n">conjunction</span>
<span class="p">]</span>
<span class="n">disjunction_members</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">reduce</span><span class="p">(</span><span class="n">operator</span><span class="o">.</span><span class="n">and_</span><span class="p">,</span> <span class="n">conjunction_members</span><span class="p">))</span>
<span class="k">return</span> <span class="n">reduce</span><span class="p">(</span><span class="n">operator</span><span class="o">.</span><span class="n">or_</span><span class="p">,</span> <span class="n">disjunction_members</span><span class="p">)</span>
<span class="n">_filters_to_expression</span> <span class="o">=</span> <span class="n">_deprecate_api</span><span class="p">(</span>
<span class="s2">&quot;_filters_to_expression&quot;</span><span class="p">,</span> <span class="s2">&quot;filters_to_expression&quot;</span><span class="p">,</span>
<span class="n">filters_to_expression</span><span class="p">,</span> <span class="s2">&quot;10.0.0&quot;</span><span class="p">,</span> <span class="ne">DeprecationWarning</span><span class="p">)</span>
<span class="c1"># ----------------------------------------------------------------------</span>
<span class="c1"># Reading a single Parquet file</span>
<div class="viewcode-block" id="ParquetFile"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.ParquetFile.html#pyarrow.parquet.ParquetFile">[docs]</a><span class="k">class</span> <span class="nc">ParquetFile</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Reader interface for a single Parquet file.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> source : str, pathlib.Path, pyarrow.NativeFile, or file-like object</span>
<span class="sd"> Readable source. For passing bytes or buffer-like file containing a</span>
<span class="sd"> Parquet file, use pyarrow.BufferReader.</span>
<span class="sd"> metadata : FileMetaData, default None</span>
<span class="sd"> Use existing metadata object, rather than reading from file.</span>
<span class="sd"> common_metadata : FileMetaData, default None</span>
<span class="sd"> Will be used in reads for pandas schema metadata if not found in the</span>
<span class="sd"> main file&#39;s metadata, no other uses at the moment.</span>
<span class="sd"> read_dictionary : list</span>
<span class="sd"> List of column names to read directly as DictionaryArray.</span>
<span class="sd"> memory_map : bool, default False</span>
<span class="sd"> If the source is a file path, use a memory map to read file, which can</span>
<span class="sd"> improve performance in some environments.</span>
<span class="sd"> buffer_size : int, default 0</span>
<span class="sd"> If positive, perform read buffering when deserializing individual</span>
<span class="sd"> column chunks. Otherwise IO calls are unbuffered.</span>
<span class="sd"> pre_buffer : bool, default False</span>
<span class="sd"> Coalesce and issue file reads in parallel to improve performance on</span>
<span class="sd"> high-latency filesystems (e.g. S3). If True, Arrow will use a</span>
<span class="sd"> background I/O thread pool.</span>
<span class="sd"> coerce_int96_timestamp_unit : str, default None</span>
<span class="sd"> Cast timestamps that are stored in INT96 format to a particular</span>
<span class="sd"> resolution (e.g. &#39;ms&#39;). Setting to None is equivalent to &#39;ns&#39;</span>
<span class="sd"> and therefore INT96 timestamps will be inferred as timestamps</span>
<span class="sd"> in nanoseconds.</span>
<span class="sd"> decryption_properties : FileDecryptionProperties, default None</span>
<span class="sd"> File decryption properties for Parquet Modular Encryption.</span>
<span class="sd"> thrift_string_size_limit : int, default None</span>
<span class="sd"> If not None, override the maximum total string size allocated</span>
<span class="sd"> when decoding Thrift structures. The default limit should be</span>
<span class="sd"> sufficient for most Parquet files.</span>
<span class="sd"> thrift_container_size_limit : int, default None</span>
<span class="sd"> If not None, override the maximum total size of containers allocated</span>
<span class="sd"> when decoding Thrift structures. The default limit should be</span>
<span class="sd"> sufficient for most Parquet files.</span>
<span class="sd"> filesystem : FileSystem, default None</span>
<span class="sd"> If nothing passed, will be inferred based on path.</span>
<span class="sd"> Path will try to be found in the local on-disk filesystem otherwise</span>
<span class="sd"> it will be parsed as an URI to determine the filesystem.</span>
<span class="sd"> page_checksum_verification : bool, default False</span>
<span class="sd"> If True, verify the checksum for each page read from the file.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Generate an example PyArrow Table and write it to Parquet file:</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow as pa</span>
<span class="sd"> &gt;&gt;&gt; table = pa.table({&#39;n_legs&#39;: [2, 2, 4, 4, 5, 100],</span>
<span class="sd"> ... &#39;animal&#39;: [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="sd"> ... &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="sd"> &gt;&gt;&gt; pq.write_table(table, &#39;example.parquet&#39;)</span>
<span class="sd"> Create a ``ParquetFile`` object from the Parquet file:</span>
<span class="sd"> &gt;&gt;&gt; parquet_file = pq.ParquetFile(&#39;example.parquet&#39;)</span>
<span class="sd"> Read the data:</span>
<span class="sd"> &gt;&gt;&gt; parquet_file.read()</span>
<span class="sd"> pyarrow.Table</span>
<span class="sd"> n_legs: int64</span>
<span class="sd"> animal: string</span>
<span class="sd"> ----</span>
<span class="sd"> n_legs: [[2,2,4,4,5,100]]</span>
<span class="sd"> animal: [[&quot;Flamingo&quot;,&quot;Parrot&quot;,&quot;Dog&quot;,&quot;Horse&quot;,&quot;Brittle stars&quot;,&quot;Centipede&quot;]]</span>
<span class="sd"> Create a ParquetFile object with &quot;animal&quot; column as DictionaryArray:</span>
<span class="sd"> &gt;&gt;&gt; parquet_file = pq.ParquetFile(&#39;example.parquet&#39;,</span>
<span class="sd"> ... read_dictionary=[&quot;animal&quot;])</span>
<span class="sd"> &gt;&gt;&gt; parquet_file.read()</span>
<span class="sd"> pyarrow.Table</span>
<span class="sd"> n_legs: int64</span>
<span class="sd"> animal: dictionary&lt;values=string, indices=int32, ordered=0&gt;</span>
<span class="sd"> ----</span>
<span class="sd"> n_legs: [[2,2,4,4,5,100]]</span>
<span class="sd"> animal: [ -- dictionary:</span>
<span class="sd"> [&quot;Flamingo&quot;,&quot;Parrot&quot;,...,&quot;Brittle stars&quot;,&quot;Centipede&quot;] -- indices:</span>
<span class="sd"> [0,1,2,3,4,5]]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<div class="viewcode-block" id="ParquetFile.__init__"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.ParquetFile.html#pyarrow.parquet.ParquetFile.__init__">[docs]</a> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">source</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">metadata</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">common_metadata</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">read_dictionary</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">memory_map</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">buffer_size</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span>
<span class="n">pre_buffer</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">coerce_int96_timestamp_unit</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">decryption_properties</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">thrift_string_size_limit</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">thrift_container_size_limit</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">page_checksum_verification</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_close_source</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="s1">&#39;closed&#39;</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span>
<span class="n">filesystem</span><span class="p">,</span> <span class="n">source</span> <span class="o">=</span> <span class="n">_resolve_filesystem_and_path</span><span class="p">(</span>
<span class="n">source</span><span class="p">,</span> <span class="n">filesystem</span><span class="p">,</span> <span class="n">memory_map</span><span class="o">=</span><span class="n">memory_map</span><span class="p">)</span>
<span class="k">if</span> <span class="n">filesystem</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">source</span> <span class="o">=</span> <span class="n">filesystem</span><span class="o">.</span><span class="n">open_input_file</span><span class="p">(</span><span class="n">source</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_close_source</span> <span class="o">=</span> <span class="kc">True</span> <span class="c1"># We opened it here, ensure we close it.</span>
<span class="bp">self</span><span class="o">.</span><span class="n">reader</span> <span class="o">=</span> <span class="n">ParquetReader</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">open</span><span class="p">(</span>
<span class="n">source</span><span class="p">,</span> <span class="n">use_memory_map</span><span class="o">=</span><span class="n">memory_map</span><span class="p">,</span>
<span class="n">buffer_size</span><span class="o">=</span><span class="n">buffer_size</span><span class="p">,</span> <span class="n">pre_buffer</span><span class="o">=</span><span class="n">pre_buffer</span><span class="p">,</span>
<span class="n">read_dictionary</span><span class="o">=</span><span class="n">read_dictionary</span><span class="p">,</span> <span class="n">metadata</span><span class="o">=</span><span class="n">metadata</span><span class="p">,</span>
<span class="n">coerce_int96_timestamp_unit</span><span class="o">=</span><span class="n">coerce_int96_timestamp_unit</span><span class="p">,</span>
<span class="n">decryption_properties</span><span class="o">=</span><span class="n">decryption_properties</span><span class="p">,</span>
<span class="n">thrift_string_size_limit</span><span class="o">=</span><span class="n">thrift_string_size_limit</span><span class="p">,</span>
<span class="n">thrift_container_size_limit</span><span class="o">=</span><span class="n">thrift_container_size_limit</span><span class="p">,</span>
<span class="n">page_checksum_verification</span><span class="o">=</span><span class="n">page_checksum_verification</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">common_metadata</span> <span class="o">=</span> <span class="n">common_metadata</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_nested_paths_by_prefix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_build_nested_paths</span><span class="p">()</span></div>
<span class="k">def</span> <span class="fm">__enter__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="bp">self</span>
<span class="k">def</span> <span class="fm">__exit__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">_build_nested_paths</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="n">paths</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">column_paths</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">list</span><span class="p">)</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">path</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">paths</span><span class="p">):</span>
<span class="n">key</span> <span class="o">=</span> <span class="n">path</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">rest</span> <span class="o">=</span> <span class="n">path</span><span class="p">[</span><span class="mi">1</span><span class="p">:]</span>
<span class="k">while</span> <span class="kc">True</span><span class="p">:</span>
<span class="n">result</span><span class="p">[</span><span class="n">key</span><span class="p">]</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">i</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">rest</span><span class="p">:</span>
<span class="k">break</span>
<span class="n">key</span> <span class="o">=</span> <span class="s1">&#39;.&#39;</span><span class="o">.</span><span class="n">join</span><span class="p">((</span><span class="n">key</span><span class="p">,</span> <span class="n">rest</span><span class="p">[</span><span class="mi">0</span><span class="p">]))</span>
<span class="n">rest</span> <span class="o">=</span> <span class="n">rest</span><span class="p">[</span><span class="mi">1</span><span class="p">:]</span>
<span class="k">return</span> <span class="n">result</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">metadata</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the Parquet metadata.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">metadata</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">schema</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the Parquet schema, unconverted to Arrow types</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">metadata</span><span class="o">.</span><span class="n">schema</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">schema_arrow</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the inferred Arrow schema, converted from the whole Parquet</span>
<span class="sd"> file&#39;s schema</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Generate an example Parquet file:</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow as pa</span>
<span class="sd"> &gt;&gt;&gt; table = pa.table({&#39;n_legs&#39;: [2, 2, 4, 4, 5, 100],</span>
<span class="sd"> ... &#39;animal&#39;: [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="sd"> ... &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="sd"> &gt;&gt;&gt; pq.write_table(table, &#39;example.parquet&#39;)</span>
<span class="sd"> &gt;&gt;&gt; parquet_file = pq.ParquetFile(&#39;example.parquet&#39;)</span>
<span class="sd"> Read the Arrow schema:</span>
<span class="sd"> &gt;&gt;&gt; parquet_file.schema_arrow</span>
<span class="sd"> n_legs: int64</span>
<span class="sd"> animal: string</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">schema_arrow</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">num_row_groups</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the number of row groups of the Parquet file.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow as pa</span>
<span class="sd"> &gt;&gt;&gt; table = pa.table({&#39;n_legs&#39;: [2, 2, 4, 4, 5, 100],</span>
<span class="sd"> ... &#39;animal&#39;: [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="sd"> ... &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="sd"> &gt;&gt;&gt; pq.write_table(table, &#39;example.parquet&#39;)</span>
<span class="sd"> &gt;&gt;&gt; parquet_file = pq.ParquetFile(&#39;example.parquet&#39;)</span>
<span class="sd"> &gt;&gt;&gt; parquet_file.num_row_groups</span>
<span class="sd"> 1</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">num_row_groups</span>
<div class="viewcode-block" id="ParquetFile.close"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.ParquetFile.html#pyarrow.parquet.ParquetFile.close">[docs]</a> <span class="k">def</span> <span class="nf">close</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">force</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">):</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_close_source</span> <span class="ow">or</span> <span class="n">force</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">close</span><span class="p">()</span></div>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">closed</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">closed</span>
<div class="viewcode-block" id="ParquetFile.read_row_group"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.ParquetFile.html#pyarrow.parquet.ParquetFile.read_row_group">[docs]</a> <span class="k">def</span> <span class="nf">read_row_group</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">i</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">use_threads</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">use_pandas_metadata</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read a single row group from a Parquet file.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> i : int</span>
<span class="sd"> Index of the individual row group that we want to read.</span>
<span class="sd"> columns : list</span>
<span class="sd"> If not None, only these columns will be read from the row group. A</span>
<span class="sd"> column name may be a prefix of a nested field, e.g. &#39;a&#39; will select</span>
<span class="sd"> &#39;a.b&#39;, &#39;a.c&#39;, and &#39;a.d.e&#39;.</span>
<span class="sd"> use_threads : bool, default True</span>
<span class="sd"> Perform multi-threaded column reads.</span>
<span class="sd"> use_pandas_metadata : bool, default False</span>
<span class="sd"> If True and file has custom pandas schema metadata, ensure that</span>
<span class="sd"> index columns are also loaded.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> pyarrow.table.Table</span>
<span class="sd"> Content of the row group as a table (of columns)</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow as pa</span>
<span class="sd"> &gt;&gt;&gt; table = pa.table({&#39;n_legs&#39;: [2, 2, 4, 4, 5, 100],</span>
<span class="sd"> ... &#39;animal&#39;: [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="sd"> ... &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="sd"> &gt;&gt;&gt; pq.write_table(table, &#39;example.parquet&#39;)</span>
<span class="sd"> &gt;&gt;&gt; parquet_file = pq.ParquetFile(&#39;example.parquet&#39;)</span>
<span class="sd"> &gt;&gt;&gt; parquet_file.read_row_group(0)</span>
<span class="sd"> pyarrow.Table</span>
<span class="sd"> n_legs: int64</span>
<span class="sd"> animal: string</span>
<span class="sd"> ----</span>
<span class="sd"> n_legs: [[2,2,4,4,5,100]]</span>
<span class="sd"> animal: [[&quot;Flamingo&quot;,&quot;Parrot&quot;,...,&quot;Brittle stars&quot;,&quot;Centipede&quot;]]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">column_indices</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_column_indices</span><span class="p">(</span>
<span class="n">columns</span><span class="p">,</span> <span class="n">use_pandas_metadata</span><span class="o">=</span><span class="n">use_pandas_metadata</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">read_row_group</span><span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="n">column_indices</span><span class="o">=</span><span class="n">column_indices</span><span class="p">,</span>
<span class="n">use_threads</span><span class="o">=</span><span class="n">use_threads</span><span class="p">)</span></div>
<div class="viewcode-block" id="ParquetFile.read_row_groups"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.ParquetFile.html#pyarrow.parquet.ParquetFile.read_row_groups">[docs]</a> <span class="k">def</span> <span class="nf">read_row_groups</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">row_groups</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">use_threads</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">use_pandas_metadata</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read a multiple row groups from a Parquet file.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> row_groups : list</span>
<span class="sd"> Only these row groups will be read from the file.</span>
<span class="sd"> columns : list</span>
<span class="sd"> If not None, only these columns will be read from the row group. A</span>
<span class="sd"> column name may be a prefix of a nested field, e.g. &#39;a&#39; will select</span>
<span class="sd"> &#39;a.b&#39;, &#39;a.c&#39;, and &#39;a.d.e&#39;.</span>
<span class="sd"> use_threads : bool, default True</span>
<span class="sd"> Perform multi-threaded column reads.</span>
<span class="sd"> use_pandas_metadata : bool, default False</span>
<span class="sd"> If True and file has custom pandas schema metadata, ensure that</span>
<span class="sd"> index columns are also loaded.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> pyarrow.table.Table</span>
<span class="sd"> Content of the row groups as a table (of columns).</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow as pa</span>
<span class="sd"> &gt;&gt;&gt; table = pa.table({&#39;n_legs&#39;: [2, 2, 4, 4, 5, 100],</span>
<span class="sd"> ... &#39;animal&#39;: [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="sd"> ... &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="sd"> &gt;&gt;&gt; pq.write_table(table, &#39;example.parquet&#39;)</span>
<span class="sd"> &gt;&gt;&gt; parquet_file = pq.ParquetFile(&#39;example.parquet&#39;)</span>
<span class="sd"> &gt;&gt;&gt; parquet_file.read_row_groups([0,0])</span>
<span class="sd"> pyarrow.Table</span>
<span class="sd"> n_legs: int64</span>
<span class="sd"> animal: string</span>
<span class="sd"> ----</span>
<span class="sd"> n_legs: [[2,2,4,4,5,...,2,4,4,5,100]]</span>
<span class="sd"> animal: [[&quot;Flamingo&quot;,&quot;Parrot&quot;,&quot;Dog&quot;,...,&quot;Brittle stars&quot;,&quot;Centipede&quot;]]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">column_indices</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_column_indices</span><span class="p">(</span>
<span class="n">columns</span><span class="p">,</span> <span class="n">use_pandas_metadata</span><span class="o">=</span><span class="n">use_pandas_metadata</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">read_row_groups</span><span class="p">(</span><span class="n">row_groups</span><span class="p">,</span>
<span class="n">column_indices</span><span class="o">=</span><span class="n">column_indices</span><span class="p">,</span>
<span class="n">use_threads</span><span class="o">=</span><span class="n">use_threads</span><span class="p">)</span></div>
<div class="viewcode-block" id="ParquetFile.iter_batches"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.ParquetFile.html#pyarrow.parquet.ParquetFile.iter_batches">[docs]</a> <span class="k">def</span> <span class="nf">iter_batches</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">batch_size</span><span class="o">=</span><span class="mi">65536</span><span class="p">,</span> <span class="n">row_groups</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">use_threads</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">use_pandas_metadata</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read streaming batches from a Parquet file.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> batch_size : int, default 64K</span>
<span class="sd"> Maximum number of records to yield per batch. Batches may be</span>
<span class="sd"> smaller if there aren&#39;t enough rows in the file.</span>
<span class="sd"> row_groups : list</span>
<span class="sd"> Only these row groups will be read from the file.</span>
<span class="sd"> columns : list</span>
<span class="sd"> If not None, only these columns will be read from the file. A</span>
<span class="sd"> column name may be a prefix of a nested field, e.g. &#39;a&#39; will select</span>
<span class="sd"> &#39;a.b&#39;, &#39;a.c&#39;, and &#39;a.d.e&#39;.</span>
<span class="sd"> use_threads : boolean, default True</span>
<span class="sd"> Perform multi-threaded column reads.</span>
<span class="sd"> use_pandas_metadata : boolean, default False</span>
<span class="sd"> If True and file has custom pandas schema metadata, ensure that</span>
<span class="sd"> index columns are also loaded.</span>
<span class="sd"> Yields</span>
<span class="sd"> ------</span>
<span class="sd"> pyarrow.RecordBatch</span>
<span class="sd"> Contents of each batch as a record batch</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Generate an example Parquet file:</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow as pa</span>
<span class="sd"> &gt;&gt;&gt; table = pa.table({&#39;n_legs&#39;: [2, 2, 4, 4, 5, 100],</span>
<span class="sd"> ... &#39;animal&#39;: [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="sd"> ... &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="sd"> &gt;&gt;&gt; pq.write_table(table, &#39;example.parquet&#39;)</span>
<span class="sd"> &gt;&gt;&gt; parquet_file = pq.ParquetFile(&#39;example.parquet&#39;)</span>
<span class="sd"> &gt;&gt;&gt; for i in parquet_file.iter_batches():</span>
<span class="sd"> ... print(&quot;RecordBatch&quot;)</span>
<span class="sd"> ... print(i.to_pandas())</span>
<span class="sd"> ...</span>
<span class="sd"> RecordBatch</span>
<span class="sd"> n_legs animal</span>
<span class="sd"> 0 2 Flamingo</span>
<span class="sd"> 1 2 Parrot</span>
<span class="sd"> 2 4 Dog</span>
<span class="sd"> 3 4 Horse</span>
<span class="sd"> 4 5 Brittle stars</span>
<span class="sd"> 5 100 Centipede</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">row_groups</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">row_groups</span> <span class="o">=</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">metadata</span><span class="o">.</span><span class="n">num_row_groups</span><span class="p">)</span>
<span class="n">column_indices</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_column_indices</span><span class="p">(</span>
<span class="n">columns</span><span class="p">,</span> <span class="n">use_pandas_metadata</span><span class="o">=</span><span class="n">use_pandas_metadata</span><span class="p">)</span>
<span class="n">batches</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">iter_batches</span><span class="p">(</span><span class="n">batch_size</span><span class="p">,</span>
<span class="n">row_groups</span><span class="o">=</span><span class="n">row_groups</span><span class="p">,</span>
<span class="n">column_indices</span><span class="o">=</span><span class="n">column_indices</span><span class="p">,</span>
<span class="n">use_threads</span><span class="o">=</span><span class="n">use_threads</span><span class="p">)</span>
<span class="k">return</span> <span class="n">batches</span></div>
<div class="viewcode-block" id="ParquetFile.read"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.ParquetFile.html#pyarrow.parquet.ParquetFile.read">[docs]</a> <span class="k">def</span> <span class="nf">read</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">use_threads</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">use_pandas_metadata</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read a Table from Parquet format.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> columns : list</span>
<span class="sd"> If not None, only these columns will be read from the file. A</span>
<span class="sd"> column name may be a prefix of a nested field, e.g. &#39;a&#39; will select</span>
<span class="sd"> &#39;a.b&#39;, &#39;a.c&#39;, and &#39;a.d.e&#39;.</span>
<span class="sd"> use_threads : bool, default True</span>
<span class="sd"> Perform multi-threaded column reads.</span>
<span class="sd"> use_pandas_metadata : bool, default False</span>
<span class="sd"> If True and file has custom pandas schema metadata, ensure that</span>
<span class="sd"> index columns are also loaded.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> pyarrow.table.Table</span>
<span class="sd"> Content of the file as a table (of columns).</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Generate an example Parquet file:</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow as pa</span>
<span class="sd"> &gt;&gt;&gt; table = pa.table({&#39;n_legs&#39;: [2, 2, 4, 4, 5, 100],</span>
<span class="sd"> ... &#39;animal&#39;: [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="sd"> ... &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="sd"> &gt;&gt;&gt; pq.write_table(table, &#39;example.parquet&#39;)</span>
<span class="sd"> &gt;&gt;&gt; parquet_file = pq.ParquetFile(&#39;example.parquet&#39;)</span>
<span class="sd"> Read a Table:</span>
<span class="sd"> &gt;&gt;&gt; parquet_file.read(columns=[&quot;animal&quot;])</span>
<span class="sd"> pyarrow.Table</span>
<span class="sd"> animal: string</span>
<span class="sd"> ----</span>
<span class="sd"> animal: [[&quot;Flamingo&quot;,&quot;Parrot&quot;,...,&quot;Brittle stars&quot;,&quot;Centipede&quot;]]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">column_indices</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_column_indices</span><span class="p">(</span>
<span class="n">columns</span><span class="p">,</span> <span class="n">use_pandas_metadata</span><span class="o">=</span><span class="n">use_pandas_metadata</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">read_all</span><span class="p">(</span><span class="n">column_indices</span><span class="o">=</span><span class="n">column_indices</span><span class="p">,</span>
<span class="n">use_threads</span><span class="o">=</span><span class="n">use_threads</span><span class="p">)</span></div>
<div class="viewcode-block" id="ParquetFile.scan_contents"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.ParquetFile.html#pyarrow.parquet.ParquetFile.scan_contents">[docs]</a> <span class="k">def</span> <span class="nf">scan_contents</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">batch_size</span><span class="o">=</span><span class="mi">65536</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read contents of file for the given columns and batch size.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This function&#39;s primary purpose is benchmarking.</span>
<span class="sd"> The scan is executed on a single thread.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> columns : list of integers, default None</span>
<span class="sd"> Select columns to read, if None scan all columns.</span>
<span class="sd"> batch_size : int, default 64K</span>
<span class="sd"> Number of rows to read at a time internally.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> num_rows : int</span>
<span class="sd"> Number of rows in file</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow as pa</span>
<span class="sd"> &gt;&gt;&gt; table = pa.table({&#39;n_legs&#39;: [2, 2, 4, 4, 5, 100],</span>
<span class="sd"> ... &#39;animal&#39;: [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="sd"> ... &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="sd"> &gt;&gt;&gt; pq.write_table(table, &#39;example.parquet&#39;)</span>
<span class="sd"> &gt;&gt;&gt; parquet_file = pq.ParquetFile(&#39;example.parquet&#39;)</span>
<span class="sd"> &gt;&gt;&gt; parquet_file.scan_contents()</span>
<span class="sd"> 6</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">column_indices</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_column_indices</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">scan_contents</span><span class="p">(</span><span class="n">column_indices</span><span class="p">,</span>
<span class="n">batch_size</span><span class="o">=</span><span class="n">batch_size</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_get_column_indices</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">column_names</span><span class="p">,</span> <span class="n">use_pandas_metadata</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<span class="k">if</span> <span class="n">column_names</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">None</span>
<span class="n">indices</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">column_names</span><span class="p">:</span>
<span class="k">if</span> <span class="n">name</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_nested_paths_by_prefix</span><span class="p">:</span>
<span class="n">indices</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_nested_paths_by_prefix</span><span class="p">[</span><span class="n">name</span><span class="p">])</span>
<span class="k">if</span> <span class="n">use_pandas_metadata</span><span class="p">:</span>
<span class="n">file_keyvalues</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">metadata</span><span class="o">.</span><span class="n">metadata</span>
<span class="n">common_keyvalues</span> <span class="o">=</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">common_metadata</span><span class="o">.</span><span class="n">metadata</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">common_metadata</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="k">else</span> <span class="kc">None</span><span class="p">)</span>
<span class="k">if</span> <span class="n">file_keyvalues</span> <span class="ow">and</span> <span class="sa">b</span><span class="s1">&#39;pandas&#39;</span> <span class="ow">in</span> <span class="n">file_keyvalues</span><span class="p">:</span>
<span class="n">index_columns</span> <span class="o">=</span> <span class="n">_get_pandas_index_columns</span><span class="p">(</span><span class="n">file_keyvalues</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">common_keyvalues</span> <span class="ow">and</span> <span class="sa">b</span><span class="s1">&#39;pandas&#39;</span> <span class="ow">in</span> <span class="n">common_keyvalues</span><span class="p">:</span>
<span class="n">index_columns</span> <span class="o">=</span> <span class="n">_get_pandas_index_columns</span><span class="p">(</span><span class="n">common_keyvalues</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">index_columns</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">if</span> <span class="n">indices</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">index_columns</span><span class="p">:</span>
<span class="n">indices</span> <span class="o">+=</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">reader</span><span class="o">.</span><span class="n">column_name_idx</span><span class="p">(</span><span class="n">descr</span><span class="p">)</span>
<span class="k">for</span> <span class="n">descr</span> <span class="ow">in</span> <span class="n">index_columns</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">descr</span><span class="p">,</span> <span class="nb">dict</span><span class="p">)]</span>
<span class="k">return</span> <span class="n">indices</span></div>
<span class="n">_SPARK_DISALLOWED_CHARS</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="s1">&#39;[ ,;</span><span class="si">{}</span><span class="s1">()</span><span class="se">\n\t</span><span class="s1">=]&#39;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_sanitized_spark_field_name</span><span class="p">(</span><span class="n">name</span><span class="p">):</span>
<span class="k">return</span> <span class="n">_SPARK_DISALLOWED_CHARS</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="s1">&#39;_&#39;</span><span class="p">,</span> <span class="n">name</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_sanitize_schema</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">flavor</span><span class="p">):</span>
<span class="k">if</span> <span class="s1">&#39;spark&#39;</span> <span class="ow">in</span> <span class="n">flavor</span><span class="p">:</span>
<span class="n">sanitized_fields</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">schema_changed</span> <span class="o">=</span> <span class="kc">False</span>
<span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">schema</span><span class="p">:</span>
<span class="n">name</span> <span class="o">=</span> <span class="n">field</span><span class="o">.</span><span class="n">name</span>
<span class="n">sanitized_name</span> <span class="o">=</span> <span class="n">_sanitized_spark_field_name</span><span class="p">(</span><span class="n">name</span><span class="p">)</span>
<span class="k">if</span> <span class="n">sanitized_name</span> <span class="o">!=</span> <span class="n">name</span><span class="p">:</span>
<span class="n">schema_changed</span> <span class="o">=</span> <span class="kc">True</span>
<span class="n">sanitized_field</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">field</span><span class="p">(</span><span class="n">sanitized_name</span><span class="p">,</span> <span class="n">field</span><span class="o">.</span><span class="n">type</span><span class="p">,</span>
<span class="n">field</span><span class="o">.</span><span class="n">nullable</span><span class="p">,</span> <span class="n">field</span><span class="o">.</span><span class="n">metadata</span><span class="p">)</span>
<span class="n">sanitized_fields</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">sanitized_field</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sanitized_fields</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">field</span><span class="p">)</span>
<span class="n">new_schema</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">schema</span><span class="p">(</span><span class="n">sanitized_fields</span><span class="p">,</span> <span class="n">metadata</span><span class="o">=</span><span class="n">schema</span><span class="o">.</span><span class="n">metadata</span><span class="p">)</span>
<span class="k">return</span> <span class="n">new_schema</span><span class="p">,</span> <span class="n">schema_changed</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">schema</span><span class="p">,</span> <span class="kc">False</span>
<span class="k">def</span> <span class="nf">_sanitize_table</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="n">new_schema</span><span class="p">,</span> <span class="n">flavor</span><span class="p">):</span>
<span class="c1"># TODO: This will not handle prohibited characters in nested field names</span>
<span class="k">if</span> <span class="s1">&#39;spark&#39;</span> <span class="ow">in</span> <span class="n">flavor</span><span class="p">:</span>
<span class="n">column_data</span> <span class="o">=</span> <span class="p">[</span><span class="n">table</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">table</span><span class="o">.</span><span class="n">num_columns</span><span class="p">)]</span>
<span class="k">return</span> <span class="n">pa</span><span class="o">.</span><span class="n">Table</span><span class="o">.</span><span class="n">from_arrays</span><span class="p">(</span><span class="n">column_data</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="n">new_schema</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">table</span>
<span class="n">_parquet_writer_arg_docs</span> <span class="o">=</span> <span class="s2">&quot;&quot;&quot;version : {&quot;1.0&quot;, &quot;2.4&quot;, &quot;2.6&quot;}, default &quot;2.6&quot;</span>
<span class="s2"> Determine which Parquet logical types are available for use, whether the</span>
<span class="s2"> reduced set from the Parquet 1.x.x format or the expanded logical types</span>
<span class="s2"> added in later format versions.</span>
<span class="s2"> Files written with version=&#39;2.4&#39; or &#39;2.6&#39; may not be readable in all</span>
<span class="s2"> Parquet implementations, so version=&#39;1.0&#39; is likely the choice that</span>
<span class="s2"> maximizes file compatibility.</span>
<span class="s2"> UINT32 and some logical types are only available with version &#39;2.4&#39;.</span>
<span class="s2"> Nanosecond timestamps are only available with version &#39;2.6&#39;.</span>
<span class="s2"> Other features such as compression algorithms or the new serialized</span>
<span class="s2"> data page format must be enabled separately (see &#39;compression&#39; and</span>
<span class="s2"> &#39;data_page_version&#39;).</span>
<span class="s2">use_dictionary : bool or list, default True</span>
<span class="s2"> Specify if we should use dictionary encoding in general or only for</span>
<span class="s2"> some columns.</span>
<span class="s2"> When encoding the column, if the dictionary size is too large, the</span>
<span class="s2"> column will fallback to ``PLAIN`` encoding. Specially, ``BOOLEAN`` type</span>
<span class="s2"> doesn&#39;t support dictionary encoding.</span>
<span class="s2">compression : str or dict, default &#39;snappy&#39;</span>
<span class="s2"> Specify the compression codec, either on a general basis or per-column.</span>
<span class="s2"> Valid values: {&#39;NONE&#39;, &#39;SNAPPY&#39;, &#39;GZIP&#39;, &#39;BROTLI&#39;, &#39;LZ4&#39;, &#39;ZSTD&#39;}.</span>
<span class="s2">write_statistics : bool or list, default True</span>
<span class="s2"> Specify if we should write statistics in general (default is True) or only</span>
<span class="s2"> for some columns.</span>
<span class="s2">use_deprecated_int96_timestamps : bool, default None</span>
<span class="s2"> Write timestamps to INT96 Parquet format. Defaults to False unless enabled</span>
<span class="s2"> by flavor argument. This take priority over the coerce_timestamps option.</span>
<span class="s2">coerce_timestamps : str, default None</span>
<span class="s2"> Cast timestamps to a particular resolution. If omitted, defaults are chosen</span>
<span class="s2"> depending on `version`. By default, for ``version=&#39;1.0&#39;`` (the default)</span>
<span class="s2"> and ``version=&#39;2.4&#39;``, nanoseconds are cast to microseconds (&#39;us&#39;), while</span>
<span class="s2"> for other `version` values, they are written natively without loss</span>
<span class="s2"> of resolution. Seconds are always cast to milliseconds (&#39;ms&#39;) by default,</span>
<span class="s2"> as Parquet does not have any temporal type with seconds resolution.</span>
<span class="s2"> If the casting results in loss of data, it will raise an exception</span>
<span class="s2"> unless ``allow_truncated_timestamps=True`` is given.</span>
<span class="s2"> Valid values: {None, &#39;ms&#39;, &#39;us&#39;}</span>
<span class="s2">allow_truncated_timestamps : bool, default False</span>
<span class="s2"> Allow loss of data when coercing timestamps to a particular</span>
<span class="s2"> resolution. E.g. if microsecond or nanosecond data is lost when coercing to</span>
<span class="s2"> &#39;ms&#39;, do not raise an exception. Passing ``allow_truncated_timestamp=True``</span>
<span class="s2"> will NOT result in the truncation exception being ignored unless</span>
<span class="s2"> ``coerce_timestamps`` is not None.</span>
<span class="s2">data_page_size : int, default None</span>
<span class="s2"> Set a target threshold for the approximate encoded size of data</span>
<span class="s2"> pages within a column chunk (in bytes). If None, use the default data page</span>
<span class="s2"> size of 1MByte.</span>
<span class="s2">flavor : {&#39;spark&#39;}, default None</span>
<span class="s2"> Sanitize schema or set other compatibility options to work with</span>
<span class="s2"> various target systems.</span>
<span class="s2">filesystem : FileSystem, default None</span>
<span class="s2"> If nothing passed, will be inferred from `where` if path-like, else</span>
<span class="s2"> `where` is already a file-like object so no filesystem is needed.</span>
<span class="s2">compression_level : int or dict, default None</span>
<span class="s2"> Specify the compression level for a codec, either on a general basis or</span>
<span class="s2"> per-column. If None is passed, arrow selects the compression level for</span>
<span class="s2"> the compression codec in use. The compression level has a different</span>
<span class="s2"> meaning for each codec, so you have to read the documentation of the</span>
<span class="s2"> codec you are using.</span>
<span class="s2"> An exception is thrown if the compression codec does not allow specifying</span>
<span class="s2"> a compression level.</span>
<span class="s2">use_byte_stream_split : bool or list, default False</span>
<span class="s2"> Specify if the byte_stream_split encoding should be used in general or</span>
<span class="s2"> only for some columns. If both dictionary and byte_stream_stream are</span>
<span class="s2"> enabled, then dictionary is preferred.</span>
<span class="s2"> The byte_stream_split encoding is valid only for floating-point data types</span>
<span class="s2"> and should be combined with a compression codec.</span>
<span class="s2">column_encoding : string or dict, default None</span>
<span class="s2"> Specify the encoding scheme on a per column basis.</span>
<span class="s2"> Can only be used when ``use_dictionary`` is set to False, and</span>
<span class="s2"> cannot be used in combination with ``use_byte_stream_split``.</span>
<span class="s2"> Currently supported values: {&#39;PLAIN&#39;, &#39;BYTE_STREAM_SPLIT&#39;,</span>
<span class="s2"> &#39;DELTA_BINARY_PACKED&#39;, &#39;DELTA_LENGTH_BYTE_ARRAY&#39;, &#39;DELTA_BYTE_ARRAY&#39;}.</span>
<span class="s2"> Certain encodings are only compatible with certain data types.</span>
<span class="s2"> Please refer to the encodings section of `Reading and writing Parquet</span>
<span class="s2"> files &lt;https://arrow.apache.org/docs/cpp/parquet.html#encodings&gt;`_.</span>
<span class="s2">data_page_version : {&quot;1.0&quot;, &quot;2.0&quot;}, default &quot;1.0&quot;</span>
<span class="s2"> The serialized Parquet data page format version to write, defaults to</span>
<span class="s2"> 1.0. This does not impact the file schema logical types and Arrow to</span>
<span class="s2"> Parquet type casting behavior; for that use the &quot;version&quot; option.</span>
<span class="s2">use_compliant_nested_type : bool, default True</span>
<span class="s2"> Whether to write compliant Parquet nested type (lists) as defined</span>
<span class="s2"> `here &lt;https://github.com/apache/parquet-format/blob/master/</span>
<span class="s2"> LogicalTypes.md#nested-types&gt;`_, defaults to ``True``.</span>
<span class="s2"> For ``use_compliant_nested_type=True``, this will write into a list</span>
<span class="s2"> with 3-level structure where the middle level, named ``list``,</span>
<span class="s2"> is a repeated group with a single field named ``element``::</span>
<span class="s2"> &lt;list-repetition&gt; group &lt;name&gt; (LIST) {</span>
<span class="s2"> repeated group list {</span>
<span class="s2"> &lt;element-repetition&gt; &lt;element-type&gt; element;</span>
<span class="s2"> }</span>
<span class="s2"> }</span>
<span class="s2"> For ``use_compliant_nested_type=False``, this will also write into a list</span>
<span class="s2"> with 3-level structure, where the name of the single field of the middle</span>
<span class="s2"> level ``list`` is taken from the element name for nested columns in Arrow,</span>
<span class="s2"> which defaults to ``item``::</span>
<span class="s2"> &lt;list-repetition&gt; group &lt;name&gt; (LIST) {</span>
<span class="s2"> repeated group list {</span>
<span class="s2"> &lt;element-repetition&gt; &lt;element-type&gt; item;</span>
<span class="s2"> }</span>
<span class="s2"> }</span>
<span class="s2">encryption_properties : FileEncryptionProperties, default None</span>
<span class="s2"> File encryption properties for Parquet Modular Encryption.</span>
<span class="s2"> If None, no encryption will be done.</span>
<span class="s2"> The encryption properties can be created using:</span>
<span class="s2"> ``CryptoFactory.file_encryption_properties()``.</span>
<span class="s2">write_batch_size : int, default None</span>
<span class="s2"> Number of values to write to a page at a time. If None, use the default of</span>
<span class="s2"> 1024. ``write_batch_size`` is complementary to ``data_page_size``. If pages</span>
<span class="s2"> are exceeding the ``data_page_size`` due to large column values, lowering</span>
<span class="s2"> the batch size can help keep page sizes closer to the intended size.</span>
<span class="s2">dictionary_pagesize_limit : int, default None</span>
<span class="s2"> Specify the dictionary page size limit per row group. If None, use the</span>
<span class="s2"> default 1MB.</span>
<span class="s2">store_schema : bool, default True</span>
<span class="s2"> By default, the Arrow schema is serialized and stored in the Parquet</span>
<span class="s2"> file metadata (in the &quot;ARROW:schema&quot; key). When reading the file,</span>
<span class="s2"> if this key is available, it will be used to more faithfully recreate</span>
<span class="s2"> the original Arrow data. For example, for tz-aware timestamp columns</span>
<span class="s2"> it will restore the timezone (Parquet only stores the UTC values without</span>
<span class="s2"> timezone), or columns with duration type will be restored from the int64</span>
<span class="s2"> Parquet column.</span>
<span class="s2">write_page_index : bool, default False</span>
<span class="s2"> Whether to write a page index in general for all columns.</span>
<span class="s2"> Writing statistics to the page index disables the old method of writing</span>
<span class="s2"> statistics to each data page header. The page index makes statistics-based</span>
<span class="s2"> filtering more efficient than the page header, as it gathers all the</span>
<span class="s2"> statistics for a Parquet file in a single place, avoiding scattered I/O.</span>
<span class="s2"> Note that the page index is not yet used on the read size by PyArrow.</span>
<span class="s2">write_page_checksum : bool, default False</span>
<span class="s2"> Whether to write page checksums in general for all columns.</span>
<span class="s2"> Page checksums enable detection of data corruption, which might occur during</span>
<span class="s2"> transmission or in the storage.</span>
<span class="s2">sorting_columns : Sequence of SortingColumn, default None</span>
<span class="s2"> Specify the sort order of the data being written. The writer does not sort</span>
<span class="s2"> the data nor does it verify that the data is sorted. The sort order is</span>
<span class="s2"> written to the row group metadata, which can then be used by readers.</span>
<span class="s2">&quot;&quot;&quot;</span>
<span class="n">_parquet_writer_example_doc</span> <span class="o">=</span> <span class="s2">&quot;&quot;&quot;</span><span class="se">\</span>
<span class="s2">Generate an example PyArrow Table and RecordBatch:</span>
<span class="s2">&gt;&gt;&gt; import pyarrow as pa</span>
<span class="s2">&gt;&gt;&gt; table = pa.table({&#39;n_legs&#39;: [2, 2, 4, 4, 5, 100],</span>
<span class="s2">... &#39;animal&#39;: [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="s2">... &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="s2">&gt;&gt;&gt; batch = pa.record_batch([[2, 2, 4, 4, 5, 100],</span>
<span class="s2">... [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="s2">... &quot;Brittle stars&quot;, &quot;Centipede&quot;]],</span>
<span class="s2">... names=[&#39;n_legs&#39;, &#39;animal&#39;])</span>
<span class="s2">create a ParquetWriter object:</span>
<span class="s2">&gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="s2">&gt;&gt;&gt; writer = pq.ParquetWriter(&#39;example.parquet&#39;, table.schema)</span>
<span class="s2">and write the Table into the Parquet file:</span>
<span class="s2">&gt;&gt;&gt; writer.write_table(table)</span>
<span class="s2">&gt;&gt;&gt; writer.close()</span>
<span class="s2">&gt;&gt;&gt; pq.read_table(&#39;example.parquet&#39;).to_pandas()</span>
<span class="s2"> n_legs animal</span>
<span class="s2">0 2 Flamingo</span>
<span class="s2">1 2 Parrot</span>
<span class="s2">2 4 Dog</span>
<span class="s2">3 4 Horse</span>
<span class="s2">4 5 Brittle stars</span>
<span class="s2">5 100 Centipede</span>
<span class="s2">create a ParquetWriter object for the RecordBatch:</span>
<span class="s2">&gt;&gt;&gt; writer2 = pq.ParquetWriter(&#39;example2.parquet&#39;, batch.schema)</span>
<span class="s2">and write the RecordBatch into the Parquet file:</span>
<span class="s2">&gt;&gt;&gt; writer2.write_batch(batch)</span>
<span class="s2">&gt;&gt;&gt; writer2.close()</span>
<span class="s2">&gt;&gt;&gt; pq.read_table(&#39;example2.parquet&#39;).to_pandas()</span>
<span class="s2"> n_legs animal</span>
<span class="s2">0 2 Flamingo</span>
<span class="s2">1 2 Parrot</span>
<span class="s2">2 4 Dog</span>
<span class="s2">3 4 Horse</span>
<span class="s2">4 5 Brittle stars</span>
<span class="s2">5 100 Centipede</span>
<span class="s2">&quot;&quot;&quot;</span>
<div class="viewcode-block" id="ParquetWriter"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.ParquetWriter.html#pyarrow.parquet.ParquetWriter">[docs]</a><span class="k">class</span> <span class="nc">ParquetWriter</span><span class="p">:</span>
<span class="vm">__doc__</span> <span class="o">=</span> <span class="s2">&quot;&quot;&quot;</span>
<span class="s2">Class for incrementally building a Parquet file for Arrow tables.</span>
<span class="s2">Parameters</span>
<span class="s2">----------</span>
<span class="s2">where : path or file-like object</span>
<span class="s2">schema : pyarrow.Schema</span>
<span class="si">{}</span>
<span class="s2">writer_engine_version : unused</span>
<span class="s2">**options : dict</span>
<span class="s2"> If options contains a key `metadata_collector` then the</span>
<span class="s2"> corresponding value is assumed to be a list (or any object with</span>
<span class="s2"> `.append` method) that will be filled with the file metadata instance</span>
<span class="s2"> of the written file.</span>
<span class="s2">Examples</span>
<span class="s2">--------</span>
<span class="si">{}</span>
<span class="s2">&quot;&quot;&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">_parquet_writer_arg_docs</span><span class="p">,</span> <span class="n">_parquet_writer_example_doc</span><span class="p">)</span>
<div class="viewcode-block" id="ParquetWriter.__init__"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.ParquetWriter.html#pyarrow.parquet.ParquetWriter.__init__">[docs]</a> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">where</span><span class="p">,</span> <span class="n">schema</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">flavor</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">version</span><span class="o">=</span><span class="s1">&#39;2.6&#39;</span><span class="p">,</span>
<span class="n">use_dictionary</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">compression</span><span class="o">=</span><span class="s1">&#39;snappy&#39;</span><span class="p">,</span>
<span class="n">write_statistics</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">use_deprecated_int96_timestamps</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">compression_level</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">use_byte_stream_split</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">column_encoding</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">writer_engine_version</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">data_page_version</span><span class="o">=</span><span class="s1">&#39;1.0&#39;</span><span class="p">,</span>
<span class="n">use_compliant_nested_type</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">encryption_properties</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">write_batch_size</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">dictionary_pagesize_limit</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">store_schema</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">write_page_index</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">write_page_checksum</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">sorting_columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">options</span><span class="p">):</span>
<span class="k">if</span> <span class="n">use_deprecated_int96_timestamps</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="c1"># Use int96 timestamps for Spark</span>
<span class="k">if</span> <span class="n">flavor</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="s1">&#39;spark&#39;</span> <span class="ow">in</span> <span class="n">flavor</span><span class="p">:</span>
<span class="n">use_deprecated_int96_timestamps</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">use_deprecated_int96_timestamps</span> <span class="o">=</span> <span class="kc">False</span>
<span class="bp">self</span><span class="o">.</span><span class="n">flavor</span> <span class="o">=</span> <span class="n">flavor</span>
<span class="k">if</span> <span class="n">flavor</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">schema</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">schema_changed</span> <span class="o">=</span> <span class="n">_sanitize_schema</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">flavor</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">schema_changed</span> <span class="o">=</span> <span class="kc">False</span>
<span class="bp">self</span><span class="o">.</span><span class="n">schema</span> <span class="o">=</span> <span class="n">schema</span>
<span class="bp">self</span><span class="o">.</span><span class="n">where</span> <span class="o">=</span> <span class="n">where</span>
<span class="c1"># If we open a file using a filesystem, store file handle so we can be</span>
<span class="c1"># sure to close it when `self.close` is called.</span>
<span class="bp">self</span><span class="o">.</span><span class="n">file_handle</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">filesystem</span><span class="p">,</span> <span class="n">path</span> <span class="o">=</span> <span class="n">_resolve_filesystem_and_path</span><span class="p">(</span><span class="n">where</span><span class="p">,</span> <span class="n">filesystem</span><span class="p">)</span>
<span class="k">if</span> <span class="n">filesystem</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="c1"># ARROW-10480: do not auto-detect compression. While</span>
<span class="c1"># a filename like foo.parquet.gz is nonconforming, it</span>
<span class="c1"># shouldn&#39;t implicitly apply compression.</span>
<span class="n">sink</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">file_handle</span> <span class="o">=</span> <span class="n">filesystem</span><span class="o">.</span><span class="n">open_output_stream</span><span class="p">(</span>
<span class="n">path</span><span class="p">,</span> <span class="n">compression</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sink</span> <span class="o">=</span> <span class="n">where</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_metadata_collector</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;metadata_collector&#39;</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
<span class="n">engine_version</span> <span class="o">=</span> <span class="s1">&#39;V2&#39;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">writer</span> <span class="o">=</span> <span class="n">_parquet</span><span class="o">.</span><span class="n">ParquetWriter</span><span class="p">(</span>
<span class="n">sink</span><span class="p">,</span> <span class="n">schema</span><span class="p">,</span>
<span class="n">version</span><span class="o">=</span><span class="n">version</span><span class="p">,</span>
<span class="n">compression</span><span class="o">=</span><span class="n">compression</span><span class="p">,</span>
<span class="n">use_dictionary</span><span class="o">=</span><span class="n">use_dictionary</span><span class="p">,</span>
<span class="n">write_statistics</span><span class="o">=</span><span class="n">write_statistics</span><span class="p">,</span>
<span class="n">use_deprecated_int96_timestamps</span><span class="o">=</span><span class="n">use_deprecated_int96_timestamps</span><span class="p">,</span>
<span class="n">compression_level</span><span class="o">=</span><span class="n">compression_level</span><span class="p">,</span>
<span class="n">use_byte_stream_split</span><span class="o">=</span><span class="n">use_byte_stream_split</span><span class="p">,</span>
<span class="n">column_encoding</span><span class="o">=</span><span class="n">column_encoding</span><span class="p">,</span>
<span class="n">writer_engine_version</span><span class="o">=</span><span class="n">engine_version</span><span class="p">,</span>
<span class="n">data_page_version</span><span class="o">=</span><span class="n">data_page_version</span><span class="p">,</span>
<span class="n">use_compliant_nested_type</span><span class="o">=</span><span class="n">use_compliant_nested_type</span><span class="p">,</span>
<span class="n">encryption_properties</span><span class="o">=</span><span class="n">encryption_properties</span><span class="p">,</span>
<span class="n">write_batch_size</span><span class="o">=</span><span class="n">write_batch_size</span><span class="p">,</span>
<span class="n">dictionary_pagesize_limit</span><span class="o">=</span><span class="n">dictionary_pagesize_limit</span><span class="p">,</span>
<span class="n">store_schema</span><span class="o">=</span><span class="n">store_schema</span><span class="p">,</span>
<span class="n">write_page_index</span><span class="o">=</span><span class="n">write_page_index</span><span class="p">,</span>
<span class="n">write_page_checksum</span><span class="o">=</span><span class="n">write_page_checksum</span><span class="p">,</span>
<span class="n">sorting_columns</span><span class="o">=</span><span class="n">sorting_columns</span><span class="p">,</span>
<span class="o">**</span><span class="n">options</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">is_open</span> <span class="o">=</span> <span class="kc">True</span></div>
<span class="k">def</span> <span class="fm">__del__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s1">&#39;is_open&#39;</span><span class="p">,</span> <span class="kc">False</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
<span class="k">def</span> <span class="fm">__enter__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="bp">self</span>
<span class="k">def</span> <span class="fm">__exit__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
<span class="c1"># return false since we want to propagate exceptions</span>
<span class="k">return</span> <span class="kc">False</span>
<div class="viewcode-block" id="ParquetWriter.write"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.ParquetWriter.html#pyarrow.parquet.ParquetWriter.write">[docs]</a> <span class="k">def</span> <span class="nf">write</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">table_or_batch</span><span class="p">,</span> <span class="n">row_group_size</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Write RecordBatch or Table to the Parquet file.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> table_or_batch : {RecordBatch, Table}</span>
<span class="sd"> row_group_size : int, default None</span>
<span class="sd"> Maximum number of rows in each written row group. If None,</span>
<span class="sd"> the row group size will be the minimum of the input</span>
<span class="sd"> table or batch length and 1024 * 1024.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">table_or_batch</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">RecordBatch</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">write_batch</span><span class="p">(</span><span class="n">table_or_batch</span><span class="p">,</span> <span class="n">row_group_size</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">table_or_batch</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">Table</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table_or_batch</span><span class="p">,</span> <span class="n">row_group_size</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">table_or_batch</span><span class="p">))</span></div>
<div class="viewcode-block" id="ParquetWriter.write_batch"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.ParquetWriter.html#pyarrow.parquet.ParquetWriter.write_batch">[docs]</a> <span class="k">def</span> <span class="nf">write_batch</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">batch</span><span class="p">,</span> <span class="n">row_group_size</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Write RecordBatch to the Parquet file.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> batch : RecordBatch</span>
<span class="sd"> row_group_size : int, default None</span>
<span class="sd"> Maximum number of rows in written row group. If None, the</span>
<span class="sd"> row group size will be the minimum of the RecordBatch</span>
<span class="sd"> size and 1024 * 1024. If set larger than 64Mi then 64Mi</span>
<span class="sd"> will be used instead.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">table</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">Table</span><span class="o">.</span><span class="n">from_batches</span><span class="p">([</span><span class="n">batch</span><span class="p">],</span> <span class="n">batch</span><span class="o">.</span><span class="n">schema</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="n">row_group_size</span><span class="p">)</span></div>
<div class="viewcode-block" id="ParquetWriter.write_table"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.ParquetWriter.html#pyarrow.parquet.ParquetWriter.write_table">[docs]</a> <span class="k">def</span> <span class="nf">write_table</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">table</span><span class="p">,</span> <span class="n">row_group_size</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Write Table to the Parquet file.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> table : Table</span>
<span class="sd"> row_group_size : int, default None</span>
<span class="sd"> Maximum number of rows in each written row group. If None,</span>
<span class="sd"> the row group size will be the minimum of the Table size</span>
<span class="sd"> and 1024 * 1024. If set larger than 64Mi then 64Mi will</span>
<span class="sd"> be used instead.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">schema_changed</span><span class="p">:</span>
<span class="n">table</span> <span class="o">=</span> <span class="n">_sanitize_table</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">schema</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">flavor</span><span class="p">)</span>
<span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_open</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">table</span><span class="o">.</span><span class="n">schema</span><span class="o">.</span><span class="n">equals</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">schema</span><span class="p">,</span> <span class="n">check_metadata</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<span class="n">msg</span> <span class="o">=</span> <span class="p">(</span><span class="s1">&#39;Table schema does not match schema used to create file: &#39;</span>
<span class="s1">&#39;</span><span class="se">\n</span><span class="s1">table:</span><span class="se">\n</span><span class="si">{!s}</span><span class="s1"> vs. </span><span class="se">\n</span><span class="s1">file:</span><span class="se">\n</span><span class="si">{!s}</span><span class="s1">&#39;</span>
<span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">table</span><span class="o">.</span><span class="n">schema</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">schema</span><span class="p">))</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">msg</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">writer</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="n">row_group_size</span><span class="o">=</span><span class="n">row_group_size</span><span class="p">)</span></div>
<div class="viewcode-block" id="ParquetWriter.close"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.ParquetWriter.html#pyarrow.parquet.ParquetWriter.close">[docs]</a> <span class="k">def</span> <span class="nf">close</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Close the connection to the Parquet file.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">is_open</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">writer</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">is_open</span> <span class="o">=</span> <span class="kc">False</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_metadata_collector</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_metadata_collector</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">writer</span><span class="o">.</span><span class="n">metadata</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">file_handle</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">file_handle</span><span class="o">.</span><span class="n">close</span><span class="p">()</span></div></div>
<span class="k">def</span> <span class="nf">_get_pandas_index_columns</span><span class="p">(</span><span class="n">keyvalues</span><span class="p">):</span>
<span class="k">return</span> <span class="p">(</span><span class="n">json</span><span class="o">.</span><span class="n">loads</span><span class="p">(</span><span class="n">keyvalues</span><span class="p">[</span><span class="sa">b</span><span class="s1">&#39;pandas&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="s1">&#39;utf8&#39;</span><span class="p">))</span>
<span class="p">[</span><span class="s1">&#39;index_columns&#39;</span><span class="p">])</span>
<span class="n">EXCLUDED_PARQUET_PATHS</span> <span class="o">=</span> <span class="p">{</span><span class="s1">&#39;_SUCCESS&#39;</span><span class="p">}</span>
<span class="n">_read_docstring_common</span> <span class="o">=</span> <span class="s2">&quot;&quot;&quot;</span><span class="se">\</span>
<span class="s2">read_dictionary : list, default None</span>
<span class="s2"> List of names or column paths (for nested types) to read directly</span>
<span class="s2"> as DictionaryArray. Only supported for BYTE_ARRAY storage. To read</span>
<span class="s2"> a flat column as dictionary-encoded pass the column name. For</span>
<span class="s2"> nested types, you must pass the full column &quot;path&quot;, which could be</span>
<span class="s2"> something like level1.level2.list.item. Refer to the Parquet</span>
<span class="s2"> file&#39;s schema to obtain the paths.</span>
<span class="s2">memory_map : bool, default False</span>
<span class="s2"> If the source is a file path, use a memory map to read file, which can</span>
<span class="s2"> improve performance in some environments.</span>
<span class="s2">buffer_size : int, default 0</span>
<span class="s2"> If positive, perform read buffering when deserializing individual</span>
<span class="s2"> column chunks. Otherwise IO calls are unbuffered.</span>
<span class="s2">partitioning : pyarrow.dataset.Partitioning or str or list of str, </span><span class="se">\</span>
<span class="s2">default &quot;hive&quot;</span>
<span class="s2"> The partitioning scheme for a partitioned dataset. The default of &quot;hive&quot;</span>
<span class="s2"> assumes directory names with key=value pairs like &quot;/year=2009/month=11&quot;.</span>
<span class="s2"> In addition, a scheme like &quot;/2009/11&quot; is also supported, in which case</span>
<span class="s2"> you need to specify the field names or a full schema. See the</span>
<span class="s2"> ``pyarrow.dataset.partitioning()`` function for more details.&quot;&quot;&quot;</span>
<span class="n">_parquet_dataset_example</span> <span class="o">=</span> <span class="s2">&quot;&quot;&quot;</span><span class="se">\</span>
<span class="s2">Generate an example PyArrow Table and write it to a partitioned dataset:</span>
<span class="s2">&gt;&gt;&gt; import pyarrow as pa</span>
<span class="s2">&gt;&gt;&gt; table = pa.table({&#39;year&#39;: [2020, 2022, 2021, 2022, 2019, 2021],</span>
<span class="s2">... &#39;n_legs&#39;: [2, 2, 4, 4, 5, 100],</span>
<span class="s2">... &#39;animal&#39;: [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="s2">... &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="s2">&gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="s2">&gt;&gt;&gt; pq.write_to_dataset(table, root_path=&#39;dataset_v2&#39;,</span>
<span class="s2">... partition_cols=[&#39;year&#39;])</span>
<span class="s2">create a ParquetDataset object from the dataset source:</span>
<span class="s2">&gt;&gt;&gt; dataset = pq.ParquetDataset(&#39;dataset_v2/&#39;)</span>
<span class="s2">and read the data:</span>
<span class="s2">&gt;&gt;&gt; dataset.read().to_pandas()</span>
<span class="s2"> n_legs animal year</span>
<span class="s2">0 5 Brittle stars 2019</span>
<span class="s2">1 2 Flamingo 2020</span>
<span class="s2">2 4 Dog 2021</span>
<span class="s2">3 100 Centipede 2021</span>
<span class="s2">4 2 Parrot 2022</span>
<span class="s2">5 4 Horse 2022</span>
<span class="s2">create a ParquetDataset object with filter:</span>
<span class="s2">&gt;&gt;&gt; dataset = pq.ParquetDataset(&#39;dataset_v2/&#39;,</span>
<span class="s2">... filters=[(&#39;n_legs&#39;,&#39;=&#39;,4)])</span>
<span class="s2">&gt;&gt;&gt; dataset.read().to_pandas()</span>
<span class="s2"> n_legs animal year</span>
<span class="s2">0 4 Dog 2021</span>
<span class="s2">1 4 Horse 2022</span>
<span class="s2">&quot;&quot;&quot;</span>
<div class="viewcode-block" id="ParquetDataset"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.ParquetDataset.html#pyarrow.parquet.ParquetDataset">[docs]</a><span class="k">class</span> <span class="nc">ParquetDataset</span><span class="p">:</span>
<span class="vm">__doc__</span> <span class="o">=</span> <span class="s2">&quot;&quot;&quot;</span>
<span class="s2">Encapsulates details of reading a complete Parquet dataset possibly</span>
<span class="s2">consisting of multiple files and partitions in subdirectories.</span>
<span class="s2">Parameters</span>
<span class="s2">----------</span>
<span class="s2">path_or_paths : str or List[str]</span>
<span class="s2"> A directory name, single file name, or list of file names.</span>
<span class="s2">filesystem : FileSystem, default None</span>
<span class="s2"> If nothing passed, will be inferred based on path.</span>
<span class="s2"> Path will try to be found in the local on-disk filesystem otherwise</span>
<span class="s2"> it will be parsed as an URI to determine the filesystem.</span>
<span class="s2">schema : pyarrow.parquet.Schema</span>
<span class="s2"> Optionally provide the Schema for the Dataset, in which case it will</span>
<span class="s2"> not be inferred from the source.</span>
<span class="s2">filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None</span>
<span class="s2"> Rows which do not match the filter predicate will be removed from scanned</span>
<span class="s2"> data. Partition keys embedded in a nested directory structure will be</span>
<span class="s2"> exploited to avoid loading files at all if they contain no matching rows.</span>
<span class="s2"> Within-file level filtering and different partitioning schemes are supported.</span>
<span class="s2"> </span><span class="si">{1}</span>
<span class="si">{0}</span>
<span class="s2">ignore_prefixes : list, optional</span>
<span class="s2"> Files matching any of these prefixes will be ignored by the</span>
<span class="s2"> discovery process.</span>
<span class="s2"> This is matched to the basename of a path.</span>
<span class="s2"> By default this is [&#39;.&#39;, &#39;_&#39;].</span>
<span class="s2"> Note that discovery happens only if a directory is passed as source.</span>
<span class="s2">pre_buffer : bool, default True</span>
<span class="s2"> Coalesce and issue file reads in parallel to improve performance on</span>
<span class="s2"> high-latency filesystems (e.g. S3, GCS). If True, Arrow will use a</span>
<span class="s2"> background I/O thread pool. If using a filesystem layer that itself</span>
<span class="s2"> performs readahead (e.g. fsspec&#39;s S3FS), disable readahead for best</span>
<span class="s2"> results. Set to False if you want to prioritize minimal memory usage</span>
<span class="s2"> over maximum speed.</span>
<span class="s2">coerce_int96_timestamp_unit : str, default None</span>
<span class="s2"> Cast timestamps that are stored in INT96 format to a particular resolution</span>
<span class="s2"> (e.g. &#39;ms&#39;). Setting to None is equivalent to &#39;ns&#39; and therefore INT96</span>
<span class="s2"> timestamps will be inferred as timestamps in nanoseconds.</span>
<span class="s2">decryption_properties : FileDecryptionProperties or None</span>
<span class="s2"> File-level decryption properties.</span>
<span class="s2"> The decryption properties can be created using</span>
<span class="s2"> ``CryptoFactory.file_decryption_properties()``.</span>
<span class="s2">thrift_string_size_limit : int, default None</span>
<span class="s2"> If not None, override the maximum total string size allocated</span>
<span class="s2"> when decoding Thrift structures. The default limit should be</span>
<span class="s2"> sufficient for most Parquet files.</span>
<span class="s2">thrift_container_size_limit : int, default None</span>
<span class="s2"> If not None, override the maximum total size of containers allocated</span>
<span class="s2"> when decoding Thrift structures. The default limit should be</span>
<span class="s2"> sufficient for most Parquet files.</span>
<span class="s2">page_checksum_verification : bool, default False</span>
<span class="s2"> If True, verify the page checksum for each page read from the file.</span>
<span class="s2">use_legacy_dataset : bool, optional</span>
<span class="s2"> Deprecated and has no effect from PyArrow version 15.0.0.</span>
<span class="s2">Examples</span>
<span class="s2">--------</span>
<span class="si">{2}</span>
<span class="s2">&quot;&quot;&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">_read_docstring_common</span><span class="p">,</span> <span class="n">_DNF_filter_doc</span><span class="p">,</span> <span class="n">_parquet_dataset_example</span><span class="p">)</span>
<div class="viewcode-block" id="ParquetDataset.__init__"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.ParquetDataset.html#pyarrow.parquet.ParquetDataset.__init__">[docs]</a> <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">path_or_paths</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">filters</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">read_dictionary</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">memory_map</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">buffer_size</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">partitioning</span><span class="o">=</span><span class="s2">&quot;hive&quot;</span><span class="p">,</span> <span class="n">ignore_prefixes</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">pre_buffer</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">coerce_int96_timestamp_unit</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">decryption_properties</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">thrift_string_size_limit</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">thrift_container_size_limit</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">page_checksum_verification</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">use_legacy_dataset</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="k">if</span> <span class="n">use_legacy_dataset</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;Passing &#39;use_legacy_dataset&#39; is deprecated as of pyarrow 15.0.0 &quot;</span>
<span class="s2">&quot;and will be removed in a future version.&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span> <span class="n">stacklevel</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="kn">import</span> <span class="nn">pyarrow.dataset</span> <span class="k">as</span> <span class="nn">ds</span>
<span class="c1"># map format arguments</span>
<span class="n">read_options</span> <span class="o">=</span> <span class="p">{</span>
<span class="s2">&quot;pre_buffer&quot;</span><span class="p">:</span> <span class="n">pre_buffer</span><span class="p">,</span>
<span class="s2">&quot;coerce_int96_timestamp_unit&quot;</span><span class="p">:</span> <span class="n">coerce_int96_timestamp_unit</span><span class="p">,</span>
<span class="s2">&quot;thrift_string_size_limit&quot;</span><span class="p">:</span> <span class="n">thrift_string_size_limit</span><span class="p">,</span>
<span class="s2">&quot;thrift_container_size_limit&quot;</span><span class="p">:</span> <span class="n">thrift_container_size_limit</span><span class="p">,</span>
<span class="s2">&quot;page_checksum_verification&quot;</span><span class="p">:</span> <span class="n">page_checksum_verification</span><span class="p">,</span>
<span class="p">}</span>
<span class="k">if</span> <span class="n">buffer_size</span><span class="p">:</span>
<span class="n">read_options</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">use_buffered_stream</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">buffer_size</span><span class="o">=</span><span class="n">buffer_size</span><span class="p">)</span>
<span class="k">if</span> <span class="n">read_dictionary</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">read_options</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">dictionary_columns</span><span class="o">=</span><span class="n">read_dictionary</span><span class="p">)</span>
<span class="k">if</span> <span class="n">decryption_properties</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">read_options</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">decryption_properties</span><span class="o">=</span><span class="n">decryption_properties</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_filter_expression</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">if</span> <span class="n">filters</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_filter_expression</span> <span class="o">=</span> <span class="n">filters_to_expression</span><span class="p">(</span><span class="n">filters</span><span class="p">)</span>
<span class="c1"># map old filesystems to new one</span>
<span class="k">if</span> <span class="n">filesystem</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">filesystem</span> <span class="o">=</span> <span class="n">_ensure_filesystem</span><span class="p">(</span>
<span class="n">filesystem</span><span class="p">,</span> <span class="n">use_mmap</span><span class="o">=</span><span class="n">memory_map</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">filesystem</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">memory_map</span><span class="p">:</span>
<span class="c1"># if memory_map is specified, assume local file system (string</span>
<span class="c1"># path can in principle be URI for any filesystem)</span>
<span class="n">filesystem</span> <span class="o">=</span> <span class="n">LocalFileSystem</span><span class="p">(</span><span class="n">use_mmap</span><span class="o">=</span><span class="n">memory_map</span><span class="p">)</span>
<span class="c1"># This needs to be checked after _ensure_filesystem, because that</span>
<span class="c1"># handles the case of an fsspec LocalFileSystem</span>
<span class="k">if</span> <span class="p">(</span>
<span class="nb">hasattr</span><span class="p">(</span><span class="n">path_or_paths</span><span class="p">,</span> <span class="s2">&quot;__fspath__&quot;</span><span class="p">)</span> <span class="ow">and</span>
<span class="n">filesystem</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span>
<span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">filesystem</span><span class="p">,</span> <span class="n">LocalFileSystem</span><span class="p">)</span>
<span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Path-like objects with __fspath__ must only be used with &quot;</span>
<span class="sa">f</span><span class="s2">&quot;local file systems, not </span><span class="si">{</span><span class="nb">type</span><span class="p">(</span><span class="n">filesystem</span><span class="p">)</span><span class="si">}</span><span class="s2">&quot;</span>
<span class="p">)</span>
<span class="c1"># check for single fragment dataset</span>
<span class="n">single_file</span> <span class="o">=</span> <span class="kc">None</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_base_dir</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">path_or_paths</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span>
<span class="k">if</span> <span class="n">_is_path_like</span><span class="p">(</span><span class="n">path_or_paths</span><span class="p">):</span>
<span class="n">path_or_paths</span> <span class="o">=</span> <span class="n">_stringify_path</span><span class="p">(</span><span class="n">path_or_paths</span><span class="p">)</span>
<span class="k">if</span> <span class="n">filesystem</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="c1"># path might be a URI describing the FileSystem as well</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">filesystem</span><span class="p">,</span> <span class="n">path_or_paths</span> <span class="o">=</span> <span class="n">FileSystem</span><span class="o">.</span><span class="n">from_uri</span><span class="p">(</span>
<span class="n">path_or_paths</span><span class="p">)</span>
<span class="k">except</span> <span class="ne">ValueError</span><span class="p">:</span>
<span class="n">filesystem</span> <span class="o">=</span> <span class="n">LocalFileSystem</span><span class="p">(</span><span class="n">use_mmap</span><span class="o">=</span><span class="n">memory_map</span><span class="p">)</span>
<span class="n">finfo</span> <span class="o">=</span> <span class="n">filesystem</span><span class="o">.</span><span class="n">get_file_info</span><span class="p">(</span><span class="n">path_or_paths</span><span class="p">)</span>
<span class="k">if</span> <span class="n">finfo</span><span class="o">.</span><span class="n">is_file</span><span class="p">:</span>
<span class="n">single_file</span> <span class="o">=</span> <span class="n">path_or_paths</span>
<span class="k">if</span> <span class="n">finfo</span><span class="o">.</span><span class="n">type</span> <span class="o">==</span> <span class="n">FileType</span><span class="o">.</span><span class="n">Directory</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_base_dir</span> <span class="o">=</span> <span class="n">path_or_paths</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">single_file</span> <span class="o">=</span> <span class="n">path_or_paths</span>
<span class="n">parquet_format</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">ParquetFileFormat</span><span class="p">(</span><span class="o">**</span><span class="n">read_options</span><span class="p">)</span>
<span class="k">if</span> <span class="n">single_file</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">fragment</span> <span class="o">=</span> <span class="n">parquet_format</span><span class="o">.</span><span class="n">make_fragment</span><span class="p">(</span><span class="n">single_file</span><span class="p">,</span> <span class="n">filesystem</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">FileSystemDataset</span><span class="p">(</span>
<span class="p">[</span><span class="n">fragment</span><span class="p">],</span> <span class="n">schema</span><span class="o">=</span><span class="n">schema</span> <span class="ow">or</span> <span class="n">fragment</span><span class="o">.</span><span class="n">physical_schema</span><span class="p">,</span>
<span class="nb">format</span><span class="o">=</span><span class="n">parquet_format</span><span class="p">,</span>
<span class="n">filesystem</span><span class="o">=</span><span class="n">fragment</span><span class="o">.</span><span class="n">filesystem</span>
<span class="p">)</span>
<span class="k">return</span>
<span class="c1"># check partitioning to enable dictionary encoding</span>
<span class="k">if</span> <span class="n">partitioning</span> <span class="o">==</span> <span class="s2">&quot;hive&quot;</span><span class="p">:</span>
<span class="n">partitioning</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">HivePartitioning</span><span class="o">.</span><span class="n">discover</span><span class="p">(</span>
<span class="n">infer_dictionary</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_dataset</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">dataset</span><span class="p">(</span><span class="n">path_or_paths</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="n">filesystem</span><span class="p">,</span>
<span class="n">schema</span><span class="o">=</span><span class="n">schema</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="n">parquet_format</span><span class="p">,</span>
<span class="n">partitioning</span><span class="o">=</span><span class="n">partitioning</span><span class="p">,</span>
<span class="n">ignore_prefixes</span><span class="o">=</span><span class="n">ignore_prefixes</span><span class="p">)</span></div>
<div class="viewcode-block" id="ParquetDataset.equals"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.ParquetDataset.html#pyarrow.parquet.ParquetDataset.equals">[docs]</a> <span class="k">def</span> <span class="nf">equals</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">ParquetDataset</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s1">&#39;`other` must be an instance of ParquetDataset&#39;</span><span class="p">)</span>
<span class="k">return</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">schema</span> <span class="o">==</span> <span class="n">other</span><span class="o">.</span><span class="n">schema</span> <span class="ow">and</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_dataset</span><span class="o">.</span><span class="n">format</span> <span class="o">==</span> <span class="n">other</span><span class="o">.</span><span class="n">_dataset</span><span class="o">.</span><span class="n">format</span> <span class="ow">and</span>
<span class="bp">self</span><span class="o">.</span><span class="n">filesystem</span> <span class="o">==</span> <span class="n">other</span><span class="o">.</span><span class="n">filesystem</span> <span class="ow">and</span>
<span class="c1"># self.fragments == other.fragments and</span>
<span class="bp">self</span><span class="o">.</span><span class="n">files</span> <span class="o">==</span> <span class="n">other</span><span class="o">.</span><span class="n">files</span><span class="p">)</span></div>
<span class="k">def</span> <span class="fm">__eq__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">equals</span><span class="p">(</span><span class="n">other</span><span class="p">)</span>
<span class="k">except</span> <span class="ne">TypeError</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">NotImplemented</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">schema</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Schema of the Dataset.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Generate an example dataset:</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow as pa</span>
<span class="sd"> &gt;&gt;&gt; table = pa.table({&#39;year&#39;: [2020, 2022, 2021, 2022, 2019, 2021],</span>
<span class="sd"> ... &#39;n_legs&#39;: [2, 2, 4, 4, 5, 100],</span>
<span class="sd"> ... &#39;animal&#39;: [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="sd"> ... &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="sd"> &gt;&gt;&gt; pq.write_to_dataset(table, root_path=&#39;dataset_v2_schema&#39;,</span>
<span class="sd"> ... partition_cols=[&#39;year&#39;])</span>
<span class="sd"> &gt;&gt;&gt; dataset = pq.ParquetDataset(&#39;dataset_v2_schema/&#39;)</span>
<span class="sd"> Read the schema:</span>
<span class="sd"> &gt;&gt;&gt; dataset.schema</span>
<span class="sd"> n_legs: int64</span>
<span class="sd"> animal: string</span>
<span class="sd"> year: dictionary&lt;values=int32, indices=int32, ordered=0&gt;</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dataset</span><span class="o">.</span><span class="n">schema</span>
<div class="viewcode-block" id="ParquetDataset.read"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.ParquetDataset.html#pyarrow.parquet.ParquetDataset.read">[docs]</a> <span class="k">def</span> <span class="nf">read</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">use_threads</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">use_pandas_metadata</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read (multiple) Parquet files as a single pyarrow.Table.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> columns : List[str]</span>
<span class="sd"> Names of columns to read from the dataset. The partition fields</span>
<span class="sd"> are not automatically included.</span>
<span class="sd"> use_threads : bool, default True</span>
<span class="sd"> Perform multi-threaded column reads.</span>
<span class="sd"> use_pandas_metadata : bool, default False</span>
<span class="sd"> If True and file has custom pandas schema metadata, ensure that</span>
<span class="sd"> index columns are also loaded.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> pyarrow.Table</span>
<span class="sd"> Content of the file as a table (of columns).</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Generate an example dataset:</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow as pa</span>
<span class="sd"> &gt;&gt;&gt; table = pa.table({&#39;year&#39;: [2020, 2022, 2021, 2022, 2019, 2021],</span>
<span class="sd"> ... &#39;n_legs&#39;: [2, 2, 4, 4, 5, 100],</span>
<span class="sd"> ... &#39;animal&#39;: [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="sd"> ... &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="sd"> &gt;&gt;&gt; pq.write_to_dataset(table, root_path=&#39;dataset_v2_read&#39;,</span>
<span class="sd"> ... partition_cols=[&#39;year&#39;])</span>
<span class="sd"> &gt;&gt;&gt; dataset = pq.ParquetDataset(&#39;dataset_v2_read/&#39;)</span>
<span class="sd"> Read the dataset:</span>
<span class="sd"> &gt;&gt;&gt; dataset.read(columns=[&quot;n_legs&quot;])</span>
<span class="sd"> pyarrow.Table</span>
<span class="sd"> n_legs: int64</span>
<span class="sd"> ----</span>
<span class="sd"> n_legs: [[5],[2],[4,100],[2,4]]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># if use_pandas_metadata, we need to include index columns in the</span>
<span class="c1"># column selection, to be able to restore those in the pandas DataFrame</span>
<span class="n">metadata</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">schema</span><span class="o">.</span><span class="n">metadata</span> <span class="ow">or</span> <span class="p">{}</span>
<span class="k">if</span> <span class="n">use_pandas_metadata</span><span class="p">:</span>
<span class="c1"># if the dataset schema metadata itself doesn&#39;t have pandas</span>
<span class="c1"># then try to get this from common file (for backwards compat)</span>
<span class="k">if</span> <span class="sa">b</span><span class="s2">&quot;pandas&quot;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">metadata</span><span class="p">:</span>
<span class="n">common_metadata</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_common_pandas_metadata</span><span class="p">()</span>
<span class="k">if</span> <span class="n">common_metadata</span><span class="p">:</span>
<span class="n">metadata</span> <span class="o">=</span> <span class="n">common_metadata</span>
<span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">use_pandas_metadata</span><span class="p">:</span>
<span class="k">if</span> <span class="n">metadata</span> <span class="ow">and</span> <span class="sa">b</span><span class="s1">&#39;pandas&#39;</span> <span class="ow">in</span> <span class="n">metadata</span><span class="p">:</span>
<span class="c1"># RangeIndex can be represented as dict instead of column name</span>
<span class="n">index_columns</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">_get_pandas_index_columns</span><span class="p">(</span><span class="n">metadata</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">dict</span><span class="p">)</span>
<span class="p">]</span>
<span class="n">columns</span> <span class="o">=</span> <span class="p">(</span>
<span class="nb">list</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">index_columns</span><span class="p">)</span> <span class="o">-</span> <span class="nb">set</span><span class="p">(</span><span class="n">columns</span><span class="p">))</span>
<span class="p">)</span>
<span class="n">table</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dataset</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span>
<span class="n">columns</span><span class="o">=</span><span class="n">columns</span><span class="p">,</span> <span class="nb">filter</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_filter_expression</span><span class="p">,</span>
<span class="n">use_threads</span><span class="o">=</span><span class="n">use_threads</span>
<span class="p">)</span>
<span class="c1"># if use_pandas_metadata, restore the pandas metadata (which gets</span>
<span class="c1"># lost if doing a specific `columns` selection in to_table)</span>
<span class="k">if</span> <span class="n">use_pandas_metadata</span><span class="p">:</span>
<span class="k">if</span> <span class="n">metadata</span> <span class="ow">and</span> <span class="sa">b</span><span class="s2">&quot;pandas&quot;</span> <span class="ow">in</span> <span class="n">metadata</span><span class="p">:</span>
<span class="n">new_metadata</span> <span class="o">=</span> <span class="n">table</span><span class="o">.</span><span class="n">schema</span><span class="o">.</span><span class="n">metadata</span> <span class="ow">or</span> <span class="p">{}</span>
<span class="n">new_metadata</span><span class="o">.</span><span class="n">update</span><span class="p">({</span><span class="sa">b</span><span class="s2">&quot;pandas&quot;</span><span class="p">:</span> <span class="n">metadata</span><span class="p">[</span><span class="sa">b</span><span class="s2">&quot;pandas&quot;</span><span class="p">]})</span>
<span class="n">table</span> <span class="o">=</span> <span class="n">table</span><span class="o">.</span><span class="n">replace_schema_metadata</span><span class="p">(</span><span class="n">new_metadata</span><span class="p">)</span>
<span class="k">return</span> <span class="n">table</span></div>
<span class="k">def</span> <span class="nf">_get_common_pandas_metadata</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_base_dir</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">None</span>
<span class="n">metadata</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">&quot;_common_metadata&quot;</span><span class="p">,</span> <span class="s2">&quot;_metadata&quot;</span><span class="p">]:</span>
<span class="n">metadata_path</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_base_dir</span><span class="p">),</span> <span class="n">name</span><span class="p">)</span>
<span class="n">finfo</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">filesystem</span><span class="o">.</span><span class="n">get_file_info</span><span class="p">(</span><span class="n">metadata_path</span><span class="p">)</span>
<span class="k">if</span> <span class="n">finfo</span><span class="o">.</span><span class="n">is_file</span><span class="p">:</span>
<span class="n">pq_meta</span> <span class="o">=</span> <span class="n">read_metadata</span><span class="p">(</span>
<span class="n">metadata_path</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">filesystem</span><span class="p">)</span>
<span class="n">metadata</span> <span class="o">=</span> <span class="n">pq_meta</span><span class="o">.</span><span class="n">metadata</span>
<span class="k">if</span> <span class="n">metadata</span> <span class="ow">and</span> <span class="sa">b</span><span class="s1">&#39;pandas&#39;</span> <span class="ow">in</span> <span class="n">metadata</span><span class="p">:</span>
<span class="k">break</span>
<span class="k">return</span> <span class="n">metadata</span>
<div class="viewcode-block" id="ParquetDataset.read_pandas"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.ParquetDataset.html#pyarrow.parquet.ParquetDataset.read_pandas">[docs]</a> <span class="k">def</span> <span class="nf">read_pandas</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read dataset including pandas metadata, if any. Other arguments passed</span>
<span class="sd"> through to :func:`read`, see docstring for further details.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> **kwargs : optional</span>
<span class="sd"> Additional options for :func:`read`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Generate an example parquet file:</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow as pa</span>
<span class="sd"> &gt;&gt;&gt; import pandas as pd</span>
<span class="sd"> &gt;&gt;&gt; df = pd.DataFrame({&#39;year&#39;: [2020, 2022, 2021, 2022, 2019, 2021],</span>
<span class="sd"> ... &#39;n_legs&#39;: [2, 2, 4, 4, 5, 100],</span>
<span class="sd"> ... &#39;animal&#39;: [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="sd"> ... &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; table = pa.Table.from_pandas(df)</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="sd"> &gt;&gt;&gt; pq.write_table(table, &#39;table_V2.parquet&#39;)</span>
<span class="sd"> &gt;&gt;&gt; dataset = pq.ParquetDataset(&#39;table_V2.parquet&#39;)</span>
<span class="sd"> Read the dataset with pandas metadata:</span>
<span class="sd"> &gt;&gt;&gt; dataset.read_pandas(columns=[&quot;n_legs&quot;])</span>
<span class="sd"> pyarrow.Table</span>
<span class="sd"> n_legs: int64</span>
<span class="sd"> ----</span>
<span class="sd"> n_legs: [[2,2,4,4,5,100]]</span>
<span class="sd"> &gt;&gt;&gt; dataset.read_pandas(columns=[&quot;n_legs&quot;]).schema.pandas_metadata</span>
<span class="sd"> {&#39;index_columns&#39;: [{&#39;kind&#39;: &#39;range&#39;, &#39;name&#39;: None, &#39;start&#39;: 0, ...}</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">read</span><span class="p">(</span><span class="n">use_pandas_metadata</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">fragments</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A list of the Dataset source fragments or pieces with absolute</span>
<span class="sd"> file paths.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Generate an example dataset:</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow as pa</span>
<span class="sd"> &gt;&gt;&gt; table = pa.table({&#39;year&#39;: [2020, 2022, 2021, 2022, 2019, 2021],</span>
<span class="sd"> ... &#39;n_legs&#39;: [2, 2, 4, 4, 5, 100],</span>
<span class="sd"> ... &#39;animal&#39;: [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="sd"> ... &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="sd"> &gt;&gt;&gt; pq.write_to_dataset(table, root_path=&#39;dataset_v2_fragments&#39;,</span>
<span class="sd"> ... partition_cols=[&#39;year&#39;])</span>
<span class="sd"> &gt;&gt;&gt; dataset = pq.ParquetDataset(&#39;dataset_v2_fragments/&#39;)</span>
<span class="sd"> List the fragments:</span>
<span class="sd"> &gt;&gt;&gt; dataset.fragments</span>
<span class="sd"> [&lt;pyarrow.dataset.ParquetFileFragment path=dataset_v2_fragments/...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="nb">list</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_dataset</span><span class="o">.</span><span class="n">get_fragments</span><span class="p">())</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">files</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A list of absolute Parquet file paths in the Dataset source.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Generate an example dataset:</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow as pa</span>
<span class="sd"> &gt;&gt;&gt; table = pa.table({&#39;year&#39;: [2020, 2022, 2021, 2022, 2019, 2021],</span>
<span class="sd"> ... &#39;n_legs&#39;: [2, 2, 4, 4, 5, 100],</span>
<span class="sd"> ... &#39;animal&#39;: [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="sd"> ... &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="sd"> &gt;&gt;&gt; pq.write_to_dataset(table, root_path=&#39;dataset_v2_files&#39;,</span>
<span class="sd"> ... partition_cols=[&#39;year&#39;])</span>
<span class="sd"> &gt;&gt;&gt; dataset = pq.ParquetDataset(&#39;dataset_v2_files/&#39;)</span>
<span class="sd"> List the files:</span>
<span class="sd"> &gt;&gt;&gt; dataset.files</span>
<span class="sd"> [&#39;dataset_v2_files/year=2019/...-0.parquet&#39;, ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dataset</span><span class="o">.</span><span class="n">files</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">filesystem</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> The filesystem type of the Dataset source.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dataset</span><span class="o">.</span><span class="n">filesystem</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">partitioning</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> The partitioning of the Dataset source, if discovered.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dataset</span><span class="o">.</span><span class="n">partitioning</span></div>
<span class="n">_read_table_docstring</span> <span class="o">=</span> <span class="s2">&quot;&quot;&quot;</span>
<span class="si">{0}</span>
<span class="s2">Parameters</span>
<span class="s2">----------</span>
<span class="s2">source : str, pyarrow.NativeFile, or file-like object</span>
<span class="s2"> If a string passed, can be a single file name or directory name. For</span>
<span class="s2"> file-like objects, only read a single file. Use pyarrow.BufferReader to</span>
<span class="s2"> read a file contained in a bytes or buffer-like object.</span>
<span class="s2">columns : list</span>
<span class="s2"> If not None, only these columns will be read from the file. A column</span>
<span class="s2"> name may be a prefix of a nested field, e.g. &#39;a&#39; will select &#39;a.b&#39;,</span>
<span class="s2"> &#39;a.c&#39;, and &#39;a.d.e&#39;. If empty, no columns will be read. Note</span>
<span class="s2"> that the table will still have the correct num_rows set despite having</span>
<span class="s2"> no columns.</span>
<span class="s2">use_threads : bool, default True</span>
<span class="s2"> Perform multi-threaded column reads.</span>
<span class="s2">schema : Schema, optional</span>
<span class="s2"> Optionally provide the Schema for the parquet dataset, in which case it</span>
<span class="s2"> will not be inferred from the source.</span>
<span class="si">{1}</span>
<span class="s2">filesystem : FileSystem, default None</span>
<span class="s2"> If nothing passed, will be inferred based on path.</span>
<span class="s2"> Path will try to be found in the local on-disk filesystem otherwise</span>
<span class="s2"> it will be parsed as an URI to determine the filesystem.</span>
<span class="s2">filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None</span>
<span class="s2"> Rows which do not match the filter predicate will be removed from scanned</span>
<span class="s2"> data. Partition keys embedded in a nested directory structure will be</span>
<span class="s2"> exploited to avoid loading files at all if they contain no matching rows.</span>
<span class="s2"> Within-file level filtering and different partitioning schemes are supported.</span>
<span class="s2"> </span><span class="si">{3}</span>
<span class="s2">use_legacy_dataset : bool, optional</span>
<span class="s2"> Deprecated and has no effect from PyArrow version 15.0.0.</span>
<span class="s2">ignore_prefixes : list, optional</span>
<span class="s2"> Files matching any of these prefixes will be ignored by the</span>
<span class="s2"> discovery process.</span>
<span class="s2"> This is matched to the basename of a path.</span>
<span class="s2"> By default this is [&#39;.&#39;, &#39;_&#39;].</span>
<span class="s2"> Note that discovery happens only if a directory is passed as source.</span>
<span class="s2">pre_buffer : bool, default True</span>
<span class="s2"> Coalesce and issue file reads in parallel to improve performance on</span>
<span class="s2"> high-latency filesystems (e.g. S3). If True, Arrow will use a</span>
<span class="s2"> background I/O thread pool. If using a filesystem layer that itself</span>
<span class="s2"> performs readahead (e.g. fsspec&#39;s S3FS), disable readahead for best</span>
<span class="s2"> results.</span>
<span class="s2">coerce_int96_timestamp_unit : str, default None</span>
<span class="s2"> Cast timestamps that are stored in INT96 format to a particular</span>
<span class="s2"> resolution (e.g. &#39;ms&#39;). Setting to None is equivalent to &#39;ns&#39;</span>
<span class="s2"> and therefore INT96 timestamps will be inferred as timestamps</span>
<span class="s2"> in nanoseconds.</span>
<span class="s2">decryption_properties : FileDecryptionProperties or None</span>
<span class="s2"> File-level decryption properties.</span>
<span class="s2"> The decryption properties can be created using</span>
<span class="s2"> ``CryptoFactory.file_decryption_properties()``.</span>
<span class="s2">thrift_string_size_limit : int, default None</span>
<span class="s2"> If not None, override the maximum total string size allocated</span>
<span class="s2"> when decoding Thrift structures. The default limit should be</span>
<span class="s2"> sufficient for most Parquet files.</span>
<span class="s2">thrift_container_size_limit : int, default None</span>
<span class="s2"> If not None, override the maximum total size of containers allocated</span>
<span class="s2"> when decoding Thrift structures. The default limit should be</span>
<span class="s2"> sufficient for most Parquet files.</span>
<span class="s2">page_checksum_verification : bool, default False</span>
<span class="s2"> If True, verify the checksum for each page read from the file.</span>
<span class="s2">Returns</span>
<span class="s2">-------</span>
<span class="si">{2}</span>
<span class="si">{4}</span>
<span class="s2">&quot;&quot;&quot;</span>
<span class="n">_read_table_example</span> <span class="o">=</span> <span class="s2">&quot;&quot;&quot;</span><span class="se">\</span>
<span class="s2">Examples</span>
<span class="s2">--------</span>
<span class="s2">Generate an example PyArrow Table and write it to a partitioned dataset:</span>
<span class="s2">&gt;&gt;&gt; import pyarrow as pa</span>
<span class="s2">&gt;&gt;&gt; table = pa.table({&#39;year&#39;: [2020, 2022, 2021, 2022, 2019, 2021],</span>
<span class="s2">... &#39;n_legs&#39;: [2, 2, 4, 4, 5, 100],</span>
<span class="s2">... &#39;animal&#39;: [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="s2">... &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="s2">&gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="s2">&gt;&gt;&gt; pq.write_to_dataset(table, root_path=&#39;dataset_name_2&#39;,</span>
<span class="s2">... partition_cols=[&#39;year&#39;])</span>
<span class="s2">Read the data:</span>
<span class="s2">&gt;&gt;&gt; pq.read_table(&#39;dataset_name_2&#39;).to_pandas()</span>
<span class="s2"> n_legs animal year</span>
<span class="s2">0 5 Brittle stars 2019</span>
<span class="s2">1 2 Flamingo 2020</span>
<span class="s2">2 4 Dog 2021</span>
<span class="s2">3 100 Centipede 2021</span>
<span class="s2">4 2 Parrot 2022</span>
<span class="s2">5 4 Horse 2022</span>
<span class="s2">Read only a subset of columns:</span>
<span class="s2">&gt;&gt;&gt; pq.read_table(&#39;dataset_name_2&#39;, columns=[&quot;n_legs&quot;, &quot;animal&quot;])</span>
<span class="s2">pyarrow.Table</span>
<span class="s2">n_legs: int64</span>
<span class="s2">animal: string</span>
<span class="s2">----</span>
<span class="s2">n_legs: [[5],[2],[4,100],[2,4]]</span>
<span class="s2">animal: [[&quot;Brittle stars&quot;],[&quot;Flamingo&quot;],[&quot;Dog&quot;,&quot;Centipede&quot;],[&quot;Parrot&quot;,&quot;Horse&quot;]]</span>
<span class="s2">Read a subset of columns and read one column as DictionaryArray:</span>
<span class="s2">&gt;&gt;&gt; pq.read_table(&#39;dataset_name_2&#39;, columns=[&quot;n_legs&quot;, &quot;animal&quot;],</span>
<span class="s2">... read_dictionary=[&quot;animal&quot;])</span>
<span class="s2">pyarrow.Table</span>
<span class="s2">n_legs: int64</span>
<span class="s2">animal: dictionary&lt;values=string, indices=int32, ordered=0&gt;</span>
<span class="s2">----</span>
<span class="s2">n_legs: [[5],[2],[4,100],[2,4]]</span>
<span class="s2">animal: [ -- dictionary:</span>
<span class="s2">[&quot;Brittle stars&quot;] -- indices:</span>
<span class="s2">[0], -- dictionary:</span>
<span class="s2">[&quot;Flamingo&quot;] -- indices:</span>
<span class="s2">[0], -- dictionary:</span>
<span class="s2">[&quot;Dog&quot;,&quot;Centipede&quot;] -- indices:</span>
<span class="s2">[0,1], -- dictionary:</span>
<span class="s2">[&quot;Parrot&quot;,&quot;Horse&quot;] -- indices:</span>
<span class="s2">[0,1]]</span>
<span class="s2">Read the table with filter:</span>
<span class="s2">&gt;&gt;&gt; pq.read_table(&#39;dataset_name_2&#39;, columns=[&quot;n_legs&quot;, &quot;animal&quot;],</span>
<span class="s2">... filters=[(&#39;n_legs&#39;,&#39;&lt;&#39;,4)]).to_pandas()</span>
<span class="s2"> n_legs animal</span>
<span class="s2">0 2 Flamingo</span>
<span class="s2">1 2 Parrot</span>
<span class="s2">Read data from a single Parquet file:</span>
<span class="s2">&gt;&gt;&gt; pq.write_table(table, &#39;example.parquet&#39;)</span>
<span class="s2">&gt;&gt;&gt; pq.read_table(&#39;dataset_name_2&#39;).to_pandas()</span>
<span class="s2"> n_legs animal year</span>
<span class="s2">0 5 Brittle stars 2019</span>
<span class="s2">1 2 Flamingo 2020</span>
<span class="s2">2 4 Dog 2021</span>
<span class="s2">3 100 Centipede 2021</span>
<span class="s2">4 2 Parrot 2022</span>
<span class="s2">5 4 Horse 2022</span>
<span class="s2">&quot;&quot;&quot;</span>
<div class="viewcode-block" id="read_table"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.read_table.html#pyarrow.parquet.read_table">[docs]</a><span class="k">def</span> <span class="nf">read_table</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">use_threads</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">schema</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">use_pandas_metadata</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">read_dictionary</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">memory_map</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">buffer_size</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">partitioning</span><span class="o">=</span><span class="s2">&quot;hive&quot;</span><span class="p">,</span>
<span class="n">filesystem</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">filters</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">use_legacy_dataset</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">ignore_prefixes</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">pre_buffer</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">coerce_int96_timestamp_unit</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">decryption_properties</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">thrift_string_size_limit</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">thrift_container_size_limit</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">page_checksum_verification</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<span class="k">if</span> <span class="n">use_legacy_dataset</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;Passing &#39;use_legacy_dataset&#39; is deprecated as of pyarrow 15.0.0 &quot;</span>
<span class="s2">&quot;and will be removed in a future version.&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span> <span class="n">stacklevel</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">dataset</span> <span class="o">=</span> <span class="n">ParquetDataset</span><span class="p">(</span>
<span class="n">source</span><span class="p">,</span>
<span class="n">schema</span><span class="o">=</span><span class="n">schema</span><span class="p">,</span>
<span class="n">filesystem</span><span class="o">=</span><span class="n">filesystem</span><span class="p">,</span>
<span class="n">partitioning</span><span class="o">=</span><span class="n">partitioning</span><span class="p">,</span>
<span class="n">memory_map</span><span class="o">=</span><span class="n">memory_map</span><span class="p">,</span>
<span class="n">read_dictionary</span><span class="o">=</span><span class="n">read_dictionary</span><span class="p">,</span>
<span class="n">buffer_size</span><span class="o">=</span><span class="n">buffer_size</span><span class="p">,</span>
<span class="n">filters</span><span class="o">=</span><span class="n">filters</span><span class="p">,</span>
<span class="n">ignore_prefixes</span><span class="o">=</span><span class="n">ignore_prefixes</span><span class="p">,</span>
<span class="n">pre_buffer</span><span class="o">=</span><span class="n">pre_buffer</span><span class="p">,</span>
<span class="n">coerce_int96_timestamp_unit</span><span class="o">=</span><span class="n">coerce_int96_timestamp_unit</span><span class="p">,</span>
<span class="n">thrift_string_size_limit</span><span class="o">=</span><span class="n">thrift_string_size_limit</span><span class="p">,</span>
<span class="n">thrift_container_size_limit</span><span class="o">=</span><span class="n">thrift_container_size_limit</span><span class="p">,</span>
<span class="n">page_checksum_verification</span><span class="o">=</span><span class="n">page_checksum_verification</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">except</span> <span class="ne">ImportError</span><span class="p">:</span>
<span class="c1"># fall back on ParquetFile for simple cases when pyarrow.dataset</span>
<span class="c1"># module is not available</span>
<span class="k">if</span> <span class="n">filters</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;the &#39;filters&#39; keyword is not supported when the &quot;</span>
<span class="s2">&quot;pyarrow.dataset module is not available&quot;</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">partitioning</span> <span class="o">!=</span> <span class="s2">&quot;hive&quot;</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;the &#39;partitioning&#39; keyword is not supported when the &quot;</span>
<span class="s2">&quot;pyarrow.dataset module is not available&quot;</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">schema</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;the &#39;schema&#39; argument is not supported when the &quot;</span>
<span class="s2">&quot;pyarrow.dataset module is not available&quot;</span>
<span class="p">)</span>
<span class="n">filesystem</span><span class="p">,</span> <span class="n">path</span> <span class="o">=</span> <span class="n">_resolve_filesystem_and_path</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="n">filesystem</span><span class="p">)</span>
<span class="k">if</span> <span class="n">filesystem</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">source</span> <span class="o">=</span> <span class="n">filesystem</span><span class="o">.</span><span class="n">open_input_file</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="c1"># TODO test that source is not a directory or a list</span>
<span class="n">dataset</span> <span class="o">=</span> <span class="n">ParquetFile</span><span class="p">(</span>
<span class="n">source</span><span class="p">,</span> <span class="n">read_dictionary</span><span class="o">=</span><span class="n">read_dictionary</span><span class="p">,</span>
<span class="n">memory_map</span><span class="o">=</span><span class="n">memory_map</span><span class="p">,</span> <span class="n">buffer_size</span><span class="o">=</span><span class="n">buffer_size</span><span class="p">,</span>
<span class="n">pre_buffer</span><span class="o">=</span><span class="n">pre_buffer</span><span class="p">,</span>
<span class="n">coerce_int96_timestamp_unit</span><span class="o">=</span><span class="n">coerce_int96_timestamp_unit</span><span class="p">,</span>
<span class="n">decryption_properties</span><span class="o">=</span><span class="n">decryption_properties</span><span class="p">,</span>
<span class="n">thrift_string_size_limit</span><span class="o">=</span><span class="n">thrift_string_size_limit</span><span class="p">,</span>
<span class="n">thrift_container_size_limit</span><span class="o">=</span><span class="n">thrift_container_size_limit</span><span class="p">,</span>
<span class="n">page_checksum_verification</span><span class="o">=</span><span class="n">page_checksum_verification</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">dataset</span><span class="o">.</span><span class="n">read</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="n">columns</span><span class="p">,</span> <span class="n">use_threads</span><span class="o">=</span><span class="n">use_threads</span><span class="p">,</span>
<span class="n">use_pandas_metadata</span><span class="o">=</span><span class="n">use_pandas_metadata</span><span class="p">)</span></div>
<span class="n">read_table</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_read_table_docstring</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Read a Table from Parquet format&quot;&quot;&quot;</span><span class="p">,</span>
<span class="s2">&quot;</span><span class="se">\n</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">join</span><span class="p">((</span><span class="s2">&quot;&quot;&quot;use_pandas_metadata : bool, default False</span>
<span class="s2"> If True and file has custom pandas schema metadata, ensure that</span>
<span class="s2"> index columns are also loaded.&quot;&quot;&quot;</span><span class="p">,</span> <span class="n">_read_docstring_common</span><span class="p">)),</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;pyarrow.Table</span>
<span class="sd"> Content of the file as a table (of columns)&quot;&quot;&quot;</span><span class="p">,</span>
<span class="n">_DNF_filter_doc</span><span class="p">,</span> <span class="n">_read_table_example</span><span class="p">)</span>
<div class="viewcode-block" id="read_pandas"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.read_pandas.html#pyarrow.parquet.read_pandas">[docs]</a><span class="k">def</span> <span class="nf">read_pandas</span><span class="p">(</span><span class="n">source</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
<span class="k">return</span> <span class="n">read_table</span><span class="p">(</span>
<span class="n">source</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="n">columns</span><span class="p">,</span> <span class="n">use_pandas_metadata</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span>
<span class="p">)</span></div>
<span class="n">read_pandas</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_read_table_docstring</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="s1">&#39;Read a Table from Parquet format, also reading DataFrame</span><span class="se">\n</span><span class="s1">&#39;</span>
<span class="s1">&#39;index values if known in the file metadata&#39;</span><span class="p">,</span>
<span class="s2">&quot;</span><span class="se">\n</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">join</span><span class="p">((</span><span class="n">_read_docstring_common</span><span class="p">,</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;**kwargs</span>
<span class="sd"> additional options for :func:`read_table`&quot;&quot;&quot;</span><span class="p">)),</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;pyarrow.Table</span>
<span class="sd"> Content of the file as a Table of Columns, including DataFrame</span>
<span class="sd"> indexes as columns&quot;&quot;&quot;</span><span class="p">,</span>
<span class="n">_DNF_filter_doc</span><span class="p">,</span> <span class="s2">&quot;&quot;</span><span class="p">)</span>
<div class="viewcode-block" id="write_table"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.write_table.html#pyarrow.parquet.write_table">[docs]</a><span class="k">def</span> <span class="nf">write_table</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="n">where</span><span class="p">,</span> <span class="n">row_group_size</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">version</span><span class="o">=</span><span class="s1">&#39;2.6&#39;</span><span class="p">,</span>
<span class="n">use_dictionary</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">compression</span><span class="o">=</span><span class="s1">&#39;snappy&#39;</span><span class="p">,</span>
<span class="n">write_statistics</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">use_deprecated_int96_timestamps</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">coerce_timestamps</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">allow_truncated_timestamps</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">data_page_size</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">flavor</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">filesystem</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">compression_level</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">use_byte_stream_split</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">column_encoding</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">data_page_version</span><span class="o">=</span><span class="s1">&#39;1.0&#39;</span><span class="p">,</span>
<span class="n">use_compliant_nested_type</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">encryption_properties</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">write_batch_size</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">dictionary_pagesize_limit</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">store_schema</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">write_page_index</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">write_page_checksum</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">sorting_columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
<span class="c1"># Implementor&#39;s note: when adding keywords here / updating defaults, also</span>
<span class="c1"># update it in write_to_dataset and _dataset_parquet.pyx ParquetFileWriteOptions</span>
<span class="n">row_group_size</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;chunk_size&#39;</span><span class="p">,</span> <span class="n">row_group_size</span><span class="p">)</span>
<span class="n">use_int96</span> <span class="o">=</span> <span class="n">use_deprecated_int96_timestamps</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">with</span> <span class="n">ParquetWriter</span><span class="p">(</span>
<span class="n">where</span><span class="p">,</span> <span class="n">table</span><span class="o">.</span><span class="n">schema</span><span class="p">,</span>
<span class="n">filesystem</span><span class="o">=</span><span class="n">filesystem</span><span class="p">,</span>
<span class="n">version</span><span class="o">=</span><span class="n">version</span><span class="p">,</span>
<span class="n">flavor</span><span class="o">=</span><span class="n">flavor</span><span class="p">,</span>
<span class="n">use_dictionary</span><span class="o">=</span><span class="n">use_dictionary</span><span class="p">,</span>
<span class="n">write_statistics</span><span class="o">=</span><span class="n">write_statistics</span><span class="p">,</span>
<span class="n">coerce_timestamps</span><span class="o">=</span><span class="n">coerce_timestamps</span><span class="p">,</span>
<span class="n">data_page_size</span><span class="o">=</span><span class="n">data_page_size</span><span class="p">,</span>
<span class="n">allow_truncated_timestamps</span><span class="o">=</span><span class="n">allow_truncated_timestamps</span><span class="p">,</span>
<span class="n">compression</span><span class="o">=</span><span class="n">compression</span><span class="p">,</span>
<span class="n">use_deprecated_int96_timestamps</span><span class="o">=</span><span class="n">use_int96</span><span class="p">,</span>
<span class="n">compression_level</span><span class="o">=</span><span class="n">compression_level</span><span class="p">,</span>
<span class="n">use_byte_stream_split</span><span class="o">=</span><span class="n">use_byte_stream_split</span><span class="p">,</span>
<span class="n">column_encoding</span><span class="o">=</span><span class="n">column_encoding</span><span class="p">,</span>
<span class="n">data_page_version</span><span class="o">=</span><span class="n">data_page_version</span><span class="p">,</span>
<span class="n">use_compliant_nested_type</span><span class="o">=</span><span class="n">use_compliant_nested_type</span><span class="p">,</span>
<span class="n">encryption_properties</span><span class="o">=</span><span class="n">encryption_properties</span><span class="p">,</span>
<span class="n">write_batch_size</span><span class="o">=</span><span class="n">write_batch_size</span><span class="p">,</span>
<span class="n">dictionary_pagesize_limit</span><span class="o">=</span><span class="n">dictionary_pagesize_limit</span><span class="p">,</span>
<span class="n">store_schema</span><span class="o">=</span><span class="n">store_schema</span><span class="p">,</span>
<span class="n">write_page_index</span><span class="o">=</span><span class="n">write_page_index</span><span class="p">,</span>
<span class="n">write_page_checksum</span><span class="o">=</span><span class="n">write_page_checksum</span><span class="p">,</span>
<span class="n">sorting_columns</span><span class="o">=</span><span class="n">sorting_columns</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> <span class="k">as</span> <span class="n">writer</span><span class="p">:</span>
<span class="n">writer</span><span class="o">.</span><span class="n">write_table</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="n">row_group_size</span><span class="o">=</span><span class="n">row_group_size</span><span class="p">)</span>
<span class="k">except</span> <span class="ne">Exception</span><span class="p">:</span>
<span class="k">if</span> <span class="n">_is_path_like</span><span class="p">(</span><span class="n">where</span><span class="p">):</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">os</span><span class="o">.</span><span class="n">remove</span><span class="p">(</span><span class="n">_stringify_path</span><span class="p">(</span><span class="n">where</span><span class="p">))</span>
<span class="k">except</span> <span class="n">os</span><span class="o">.</span><span class="n">error</span><span class="p">:</span>
<span class="k">pass</span>
<span class="k">raise</span></div>
<span class="n">_write_table_example</span> <span class="o">=</span> <span class="s2">&quot;&quot;&quot;</span><span class="se">\</span>
<span class="s2">Generate an example PyArrow Table:</span>
<span class="s2">&gt;&gt;&gt; import pyarrow as pa</span>
<span class="s2">&gt;&gt;&gt; table = pa.table({&#39;n_legs&#39;: [2, 2, 4, 4, 5, 100],</span>
<span class="s2">... &#39;animal&#39;: [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="s2">... &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="s2">and write the Table into Parquet file:</span>
<span class="s2">&gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="s2">&gt;&gt;&gt; pq.write_table(table, &#39;example.parquet&#39;)</span>
<span class="s2">Defining row group size for the Parquet file:</span>
<span class="s2">&gt;&gt;&gt; pq.write_table(table, &#39;example.parquet&#39;, row_group_size=3)</span>
<span class="s2">Defining row group compression (default is Snappy):</span>
<span class="s2">&gt;&gt;&gt; pq.write_table(table, &#39;example.parquet&#39;, compression=&#39;none&#39;)</span>
<span class="s2">Defining row group compression and encoding per-column:</span>
<span class="s2">&gt;&gt;&gt; pq.write_table(table, &#39;example.parquet&#39;,</span>
<span class="s2">... compression={&#39;n_legs&#39;: &#39;snappy&#39;, &#39;animal&#39;: &#39;gzip&#39;},</span>
<span class="s2">... use_dictionary=[&#39;n_legs&#39;, &#39;animal&#39;])</span>
<span class="s2">Defining column encoding per-column:</span>
<span class="s2">&gt;&gt;&gt; pq.write_table(table, &#39;example.parquet&#39;,</span>
<span class="s2">... column_encoding={&#39;animal&#39;:&#39;PLAIN&#39;},</span>
<span class="s2">... use_dictionary=False)</span>
<span class="s2">&quot;&quot;&quot;</span>
<span class="n">write_table</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="s2">&quot;&quot;&quot;</span>
<span class="s2">Write a Table to Parquet format.</span>
<span class="s2">Parameters</span>
<span class="s2">----------</span>
<span class="s2">table : pyarrow.Table</span>
<span class="s2">where : string or pyarrow.NativeFile</span>
<span class="s2">row_group_size : int</span>
<span class="s2"> Maximum number of rows in each written row group. If None, the</span>
<span class="s2"> row group size will be the minimum of the Table size and</span>
<span class="s2"> 1024 * 1024.</span>
<span class="si">{}</span>
<span class="s2">**kwargs : optional</span>
<span class="s2"> Additional options for ParquetWriter</span>
<span class="s2">Examples</span>
<span class="s2">--------</span>
<span class="si">{}</span>
<span class="s2">&quot;&quot;&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">_parquet_writer_arg_docs</span><span class="p">,</span> <span class="n">_write_table_example</span><span class="p">)</span>
<div class="viewcode-block" id="write_to_dataset"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.write_to_dataset.html#pyarrow.parquet.write_to_dataset">[docs]</a><span class="k">def</span> <span class="nf">write_to_dataset</span><span class="p">(</span><span class="n">table</span><span class="p">,</span> <span class="n">root_path</span><span class="p">,</span> <span class="n">partition_cols</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">filesystem</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">use_legacy_dataset</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">schema</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">partitioning</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">basename_template</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">use_threads</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">file_visitor</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">existing_data_behavior</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Wrapper around dataset.write_dataset for writing a Table to</span>
<span class="sd"> Parquet format by partitions.</span>
<span class="sd"> For each combination of partition columns and values,</span>
<span class="sd"> a subdirectories are created in the following</span>
<span class="sd"> manner:</span>
<span class="sd"> root_dir/</span>
<span class="sd"> group1=value1</span>
<span class="sd"> group2=value1</span>
<span class="sd"> &lt;uuid&gt;.parquet</span>
<span class="sd"> group2=value2</span>
<span class="sd"> &lt;uuid&gt;.parquet</span>
<span class="sd"> group1=valueN</span>
<span class="sd"> group2=value1</span>
<span class="sd"> &lt;uuid&gt;.parquet</span>
<span class="sd"> group2=valueN</span>
<span class="sd"> &lt;uuid&gt;.parquet</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> table : pyarrow.Table</span>
<span class="sd"> root_path : str, pathlib.Path</span>
<span class="sd"> The root directory of the dataset.</span>
<span class="sd"> partition_cols : list,</span>
<span class="sd"> Column names by which to partition the dataset.</span>
<span class="sd"> Columns are partitioned in the order they are given.</span>
<span class="sd"> filesystem : FileSystem, default None</span>
<span class="sd"> If nothing passed, will be inferred based on path.</span>
<span class="sd"> Path will try to be found in the local on-disk filesystem otherwise</span>
<span class="sd"> it will be parsed as an URI to determine the filesystem.</span>
<span class="sd"> use_legacy_dataset : bool, optional</span>
<span class="sd"> Deprecated and has no effect from PyArrow version 15.0.0.</span>
<span class="sd"> schema : Schema, optional</span>
<span class="sd"> This Schema of the dataset.</span>
<span class="sd"> partitioning : Partitioning or list[str], optional</span>
<span class="sd"> The partitioning scheme specified with the</span>
<span class="sd"> ``pyarrow.dataset.partitioning()`` function or a list of field names.</span>
<span class="sd"> When providing a list of field names, you can use</span>
<span class="sd"> ``partitioning_flavor`` to drive which partitioning type should be</span>
<span class="sd"> used.</span>
<span class="sd"> basename_template : str, optional</span>
<span class="sd"> A template string used to generate basenames of written data files.</span>
<span class="sd"> The token &#39;{i}&#39; will be replaced with an automatically incremented</span>
<span class="sd"> integer. If not specified, it defaults to &quot;guid-{i}.parquet&quot;.</span>
<span class="sd"> use_threads : bool, default True</span>
<span class="sd"> Write files in parallel. If enabled, then maximum parallelism will be</span>
<span class="sd"> used determined by the number of available CPU cores.</span>
<span class="sd"> file_visitor : function</span>
<span class="sd"> If set, this function will be called with a WrittenFile instance</span>
<span class="sd"> for each file created during the call. This object will have both</span>
<span class="sd"> a path attribute and a metadata attribute.</span>
<span class="sd"> The path attribute will be a string containing the path to</span>
<span class="sd"> the created file.</span>
<span class="sd"> The metadata attribute will be the parquet metadata of the file.</span>
<span class="sd"> This metadata will have the file path attribute set and can be used</span>
<span class="sd"> to build a _metadata file. The metadata attribute will be None if</span>
<span class="sd"> the format is not parquet.</span>
<span class="sd"> Example visitor which simple collects the filenames created::</span>
<span class="sd"> visited_paths = []</span>
<span class="sd"> def file_visitor(written_file):</span>
<span class="sd"> visited_paths.append(written_file.path)</span>
<span class="sd"> existing_data_behavior : &#39;overwrite_or_ignore&#39; | &#39;error&#39; | \</span>
<span class="sd">&#39;delete_matching&#39;</span>
<span class="sd"> Controls how the dataset will handle data that already exists in</span>
<span class="sd"> the destination. The default behaviour is &#39;overwrite_or_ignore&#39;.</span>
<span class="sd"> &#39;overwrite_or_ignore&#39; will ignore any existing data and will</span>
<span class="sd"> overwrite files with the same name as an output file. Other</span>
<span class="sd"> existing files will be ignored. This behavior, in combination</span>
<span class="sd"> with a unique basename_template for each write, will allow for</span>
<span class="sd"> an append workflow.</span>
<span class="sd"> &#39;error&#39; will raise an error if any data exists in the destination.</span>
<span class="sd"> &#39;delete_matching&#39; is useful when you are writing a partitioned</span>
<span class="sd"> dataset. The first time each partition directory is encountered</span>
<span class="sd"> the entire directory will be deleted. This allows you to overwrite</span>
<span class="sd"> old partitions completely.</span>
<span class="sd"> **kwargs : dict,</span>
<span class="sd"> Used as additional kwargs for :func:`pyarrow.dataset.write_dataset`</span>
<span class="sd"> function for matching kwargs, and remainder to</span>
<span class="sd"> :func:`pyarrow.dataset.ParquetFileFormat.make_write_options`.</span>
<span class="sd"> See the docstring of :func:`write_table` and</span>
<span class="sd"> :func:`pyarrow.dataset.write_dataset` for the available options.</span>
<span class="sd"> Using `metadata_collector` in kwargs allows one to collect the</span>
<span class="sd"> file metadata instances of dataset pieces. The file paths in the</span>
<span class="sd"> ColumnChunkMetaData will be set relative to `root_path`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Generate an example PyArrow Table:</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow as pa</span>
<span class="sd"> &gt;&gt;&gt; table = pa.table({&#39;year&#39;: [2020, 2022, 2021, 2022, 2019, 2021],</span>
<span class="sd"> ... &#39;n_legs&#39;: [2, 2, 4, 4, 5, 100],</span>
<span class="sd"> ... &#39;animal&#39;: [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="sd"> ... &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="sd"> and write it to a partitioned dataset:</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="sd"> &gt;&gt;&gt; pq.write_to_dataset(table, root_path=&#39;dataset_name_3&#39;,</span>
<span class="sd"> ... partition_cols=[&#39;year&#39;])</span>
<span class="sd"> &gt;&gt;&gt; pq.ParquetDataset(&#39;dataset_name_3&#39;).files</span>
<span class="sd"> [&#39;dataset_name_3/year=2019/...-0.parquet&#39;, ...</span>
<span class="sd"> Write a single Parquet file into the root folder:</span>
<span class="sd"> &gt;&gt;&gt; pq.write_to_dataset(table, root_path=&#39;dataset_name_4&#39;)</span>
<span class="sd"> &gt;&gt;&gt; pq.ParquetDataset(&#39;dataset_name_4/&#39;).files</span>
<span class="sd"> [&#39;dataset_name_4/...-0.parquet&#39;]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">use_legacy_dataset</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;Passing &#39;use_legacy_dataset&#39; is deprecated as of pyarrow 15.0.0 &quot;</span>
<span class="s2">&quot;and will be removed in a future version.&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span> <span class="n">stacklevel</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="n">metadata_collector</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;metadata_collector&#39;</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
<span class="c1"># Check for conflicting keywords</span>
<span class="n">msg_confl</span> <span class="o">=</span> <span class="p">(</span>
<span class="s2">&quot;The &#39;</span><span class="si">{1}</span><span class="s2">&#39; argument is not supported. &quot;</span>
<span class="s2">&quot;Use only &#39;</span><span class="si">{0}</span><span class="s2">&#39; instead.&quot;</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">partition_cols</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">partitioning</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">msg_confl</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;partitioning&quot;</span><span class="p">,</span>
<span class="s2">&quot;partition_cols&quot;</span><span class="p">))</span>
<span class="k">if</span> <span class="n">metadata_collector</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">file_visitor</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">msg_confl</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;file_visitor&quot;</span><span class="p">,</span>
<span class="s2">&quot;metadata_collector&quot;</span><span class="p">))</span>
<span class="kn">import</span> <span class="nn">pyarrow.dataset</span> <span class="k">as</span> <span class="nn">ds</span>
<span class="c1"># extract write_dataset specific options</span>
<span class="c1"># reset assumed to go to make_write_options</span>
<span class="n">write_dataset_kwargs</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">()</span>
<span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">inspect</span><span class="o">.</span><span class="n">signature</span><span class="p">(</span><span class="n">ds</span><span class="o">.</span><span class="n">write_dataset</span><span class="p">)</span><span class="o">.</span><span class="n">parameters</span><span class="p">:</span>
<span class="k">if</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">kwargs</span><span class="p">:</span>
<span class="n">write_dataset_kwargs</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="n">key</span><span class="p">)</span>
<span class="n">write_dataset_kwargs</span><span class="p">[</span><span class="s1">&#39;max_rows_per_group&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span>
<span class="s1">&#39;row_group_size&#39;</span><span class="p">,</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s2">&quot;chunk_size&quot;</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">metadata_collector</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">file_visitor</span><span class="p">(</span><span class="n">written_file</span><span class="p">):</span>
<span class="n">metadata_collector</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">written_file</span><span class="o">.</span><span class="n">metadata</span><span class="p">)</span>
<span class="c1"># map format arguments</span>
<span class="n">parquet_format</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">ParquetFileFormat</span><span class="p">()</span>
<span class="n">write_options</span> <span class="o">=</span> <span class="n">parquet_format</span><span class="o">.</span><span class="n">make_write_options</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="c1"># map old filesystems to new one</span>
<span class="k">if</span> <span class="n">filesystem</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">filesystem</span> <span class="o">=</span> <span class="n">_ensure_filesystem</span><span class="p">(</span><span class="n">filesystem</span><span class="p">)</span>
<span class="k">if</span> <span class="n">partition_cols</span><span class="p">:</span>
<span class="n">part_schema</span> <span class="o">=</span> <span class="n">table</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">partition_cols</span><span class="p">)</span><span class="o">.</span><span class="n">schema</span>
<span class="n">partitioning</span> <span class="o">=</span> <span class="n">ds</span><span class="o">.</span><span class="n">partitioning</span><span class="p">(</span><span class="n">part_schema</span><span class="p">,</span> <span class="n">flavor</span><span class="o">=</span><span class="s2">&quot;hive&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">basename_template</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">basename_template</span> <span class="o">=</span> <span class="n">guid</span><span class="p">()</span> <span class="o">+</span> <span class="s1">&#39;-</span><span class="si">{i}</span><span class="s1">.parquet&#39;</span>
<span class="k">if</span> <span class="n">existing_data_behavior</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">existing_data_behavior</span> <span class="o">=</span> <span class="s1">&#39;overwrite_or_ignore&#39;</span>
<span class="n">ds</span><span class="o">.</span><span class="n">write_dataset</span><span class="p">(</span>
<span class="n">table</span><span class="p">,</span> <span class="n">root_path</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="n">filesystem</span><span class="p">,</span>
<span class="nb">format</span><span class="o">=</span><span class="n">parquet_format</span><span class="p">,</span> <span class="n">file_options</span><span class="o">=</span><span class="n">write_options</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="n">schema</span><span class="p">,</span>
<span class="n">partitioning</span><span class="o">=</span><span class="n">partitioning</span><span class="p">,</span> <span class="n">use_threads</span><span class="o">=</span><span class="n">use_threads</span><span class="p">,</span>
<span class="n">file_visitor</span><span class="o">=</span><span class="n">file_visitor</span><span class="p">,</span>
<span class="n">basename_template</span><span class="o">=</span><span class="n">basename_template</span><span class="p">,</span>
<span class="n">existing_data_behavior</span><span class="o">=</span><span class="n">existing_data_behavior</span><span class="p">,</span>
<span class="o">**</span><span class="n">write_dataset_kwargs</span><span class="p">)</span>
<span class="k">return</span></div>
<div class="viewcode-block" id="write_metadata"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.write_metadata.html#pyarrow.parquet.write_metadata">[docs]</a><span class="k">def</span> <span class="nf">write_metadata</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">where</span><span class="p">,</span> <span class="n">metadata_collector</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Write metadata-only Parquet file from schema. This can be used with</span>
<span class="sd"> `write_to_dataset` to generate `_common_metadata` and `_metadata` sidecar</span>
<span class="sd"> files.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> schema : pyarrow.Schema</span>
<span class="sd"> where : string or pyarrow.NativeFile</span>
<span class="sd"> metadata_collector : list</span>
<span class="sd"> where to collect metadata information.</span>
<span class="sd"> filesystem : FileSystem, default None</span>
<span class="sd"> If nothing passed, will be inferred from `where` if path-like, else</span>
<span class="sd"> `where` is already a file-like object so no filesystem is needed.</span>
<span class="sd"> **kwargs : dict,</span>
<span class="sd"> Additional kwargs for ParquetWriter class. See docstring for</span>
<span class="sd"> `ParquetWriter` for more information.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Generate example data:</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow as pa</span>
<span class="sd"> &gt;&gt;&gt; table = pa.table({&#39;n_legs&#39;: [2, 2, 4, 4, 5, 100],</span>
<span class="sd"> ... &#39;animal&#39;: [&quot;Flamingo&quot;, &quot;Parrot&quot;, &quot;Dog&quot;, &quot;Horse&quot;,</span>
<span class="sd"> ... &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="sd"> Write a dataset and collect metadata information.</span>
<span class="sd"> &gt;&gt;&gt; metadata_collector = []</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="sd"> &gt;&gt;&gt; pq.write_to_dataset(</span>
<span class="sd"> ... table, &#39;dataset_metadata&#39;,</span>
<span class="sd"> ... metadata_collector=metadata_collector)</span>
<span class="sd"> Write the `_common_metadata` parquet file without row groups statistics.</span>
<span class="sd"> &gt;&gt;&gt; pq.write_metadata(</span>
<span class="sd"> ... table.schema, &#39;dataset_metadata/_common_metadata&#39;)</span>
<span class="sd"> Write the `_metadata` parquet file with row groups statistics.</span>
<span class="sd"> &gt;&gt;&gt; pq.write_metadata(</span>
<span class="sd"> ... table.schema, &#39;dataset_metadata/_metadata&#39;,</span>
<span class="sd"> ... metadata_collector=metadata_collector)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">filesystem</span><span class="p">,</span> <span class="n">where</span> <span class="o">=</span> <span class="n">_resolve_filesystem_and_path</span><span class="p">(</span><span class="n">where</span><span class="p">,</span> <span class="n">filesystem</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">where</span><span class="p">,</span> <span class="s2">&quot;seek&quot;</span><span class="p">):</span> <span class="c1"># file-like</span>
<span class="n">cursor_position</span> <span class="o">=</span> <span class="n">where</span><span class="o">.</span><span class="n">tell</span><span class="p">()</span>
<span class="n">writer</span> <span class="o">=</span> <span class="n">ParquetWriter</span><span class="p">(</span><span class="n">where</span><span class="p">,</span> <span class="n">schema</span><span class="p">,</span> <span class="n">filesystem</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="n">writer</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
<span class="k">if</span> <span class="n">metadata_collector</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="c1"># ParquetWriter doesn&#39;t expose the metadata until it&#39;s written. Write</span>
<span class="c1"># it and read it again.</span>
<span class="n">metadata</span> <span class="o">=</span> <span class="n">read_metadata</span><span class="p">(</span><span class="n">where</span><span class="p">,</span> <span class="n">filesystem</span><span class="o">=</span><span class="n">filesystem</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">where</span><span class="p">,</span> <span class="s2">&quot;seek&quot;</span><span class="p">):</span>
<span class="n">where</span><span class="o">.</span><span class="n">seek</span><span class="p">(</span><span class="n">cursor_position</span><span class="p">)</span> <span class="c1"># file-like, set cursor back.</span>
<span class="k">for</span> <span class="n">m</span> <span class="ow">in</span> <span class="n">metadata_collector</span><span class="p">:</span>
<span class="n">metadata</span><span class="o">.</span><span class="n">append_row_groups</span><span class="p">(</span><span class="n">m</span><span class="p">)</span>
<span class="k">if</span> <span class="n">filesystem</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">with</span> <span class="n">filesystem</span><span class="o">.</span><span class="n">open_output_stream</span><span class="p">(</span><span class="n">where</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
<span class="n">metadata</span><span class="o">.</span><span class="n">write_metadata_file</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">metadata</span><span class="o">.</span><span class="n">write_metadata_file</span><span class="p">(</span><span class="n">where</span><span class="p">)</span></div>
<div class="viewcode-block" id="read_metadata"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.read_metadata.html#pyarrow.parquet.read_metadata">[docs]</a><span class="k">def</span> <span class="nf">read_metadata</span><span class="p">(</span><span class="n">where</span><span class="p">,</span> <span class="n">memory_map</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">decryption_properties</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">filesystem</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read FileMetaData from footer of a single Parquet file.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> where : str (file path) or file-like object</span>
<span class="sd"> memory_map : bool, default False</span>
<span class="sd"> Create memory map when the source is a file path.</span>
<span class="sd"> decryption_properties : FileDecryptionProperties, default None</span>
<span class="sd"> Decryption properties for reading encrypted Parquet files.</span>
<span class="sd"> filesystem : FileSystem, default None</span>
<span class="sd"> If nothing passed, will be inferred based on path.</span>
<span class="sd"> Path will try to be found in the local on-disk filesystem otherwise</span>
<span class="sd"> it will be parsed as an URI to determine the filesystem.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> metadata : FileMetaData</span>
<span class="sd"> The metadata of the Parquet file</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow as pa</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="sd"> &gt;&gt;&gt; table = pa.table({&#39;n_legs&#39;: [4, 5, 100],</span>
<span class="sd"> ... &#39;animal&#39;: [&quot;Dog&quot;, &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; pq.write_table(table, &#39;example.parquet&#39;)</span>
<span class="sd"> &gt;&gt;&gt; pq.read_metadata(&#39;example.parquet&#39;)</span>
<span class="sd"> &lt;pyarrow._parquet.FileMetaData object at ...&gt;</span>
<span class="sd"> created_by: parquet-cpp-arrow version ...</span>
<span class="sd"> num_columns: 2</span>
<span class="sd"> num_rows: 3</span>
<span class="sd"> num_row_groups: 1</span>
<span class="sd"> format_version: 2.6</span>
<span class="sd"> serialized_size: ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">filesystem</span><span class="p">,</span> <span class="n">where</span> <span class="o">=</span> <span class="n">_resolve_filesystem_and_path</span><span class="p">(</span><span class="n">where</span><span class="p">,</span> <span class="n">filesystem</span><span class="p">)</span>
<span class="n">file_ctx</span> <span class="o">=</span> <span class="n">nullcontext</span><span class="p">()</span>
<span class="k">if</span> <span class="n">filesystem</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">file_ctx</span> <span class="o">=</span> <span class="n">where</span> <span class="o">=</span> <span class="n">filesystem</span><span class="o">.</span><span class="n">open_input_file</span><span class="p">(</span><span class="n">where</span><span class="p">)</span>
<span class="k">with</span> <span class="n">file_ctx</span><span class="p">:</span>
<span class="n">file</span> <span class="o">=</span> <span class="n">ParquetFile</span><span class="p">(</span><span class="n">where</span><span class="p">,</span> <span class="n">memory_map</span><span class="o">=</span><span class="n">memory_map</span><span class="p">,</span>
<span class="n">decryption_properties</span><span class="o">=</span><span class="n">decryption_properties</span><span class="p">)</span>
<span class="k">return</span> <span class="n">file</span><span class="o">.</span><span class="n">metadata</span></div>
<div class="viewcode-block" id="read_schema"><a class="viewcode-back" href="../../../python/generated/pyarrow.parquet.read_schema.html#pyarrow.parquet.read_schema">[docs]</a><span class="k">def</span> <span class="nf">read_schema</span><span class="p">(</span><span class="n">where</span><span class="p">,</span> <span class="n">memory_map</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">decryption_properties</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">filesystem</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read effective Arrow schema from Parquet file metadata.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> where : str (file path) or file-like object</span>
<span class="sd"> memory_map : bool, default False</span>
<span class="sd"> Create memory map when the source is a file path.</span>
<span class="sd"> decryption_properties : FileDecryptionProperties, default None</span>
<span class="sd"> Decryption properties for reading encrypted Parquet files.</span>
<span class="sd"> filesystem : FileSystem, default None</span>
<span class="sd"> If nothing passed, will be inferred based on path.</span>
<span class="sd"> Path will try to be found in the local on-disk filesystem otherwise</span>
<span class="sd"> it will be parsed as an URI to determine the filesystem.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> schema : pyarrow.Schema</span>
<span class="sd"> The schema of the Parquet file</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow as pa</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow.parquet as pq</span>
<span class="sd"> &gt;&gt;&gt; table = pa.table({&#39;n_legs&#39;: [4, 5, 100],</span>
<span class="sd"> ... &#39;animal&#39;: [&quot;Dog&quot;, &quot;Brittle stars&quot;, &quot;Centipede&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; pq.write_table(table, &#39;example.parquet&#39;)</span>
<span class="sd"> &gt;&gt;&gt; pq.read_schema(&#39;example.parquet&#39;)</span>
<span class="sd"> n_legs: int64</span>
<span class="sd"> animal: string</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">filesystem</span><span class="p">,</span> <span class="n">where</span> <span class="o">=</span> <span class="n">_resolve_filesystem_and_path</span><span class="p">(</span><span class="n">where</span><span class="p">,</span> <span class="n">filesystem</span><span class="p">)</span>
<span class="n">file_ctx</span> <span class="o">=</span> <span class="n">nullcontext</span><span class="p">()</span>
<span class="k">if</span> <span class="n">filesystem</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">file_ctx</span> <span class="o">=</span> <span class="n">where</span> <span class="o">=</span> <span class="n">filesystem</span><span class="o">.</span><span class="n">open_input_file</span><span class="p">(</span><span class="n">where</span><span class="p">)</span>
<span class="k">with</span> <span class="n">file_ctx</span><span class="p">:</span>
<span class="n">file</span> <span class="o">=</span> <span class="n">ParquetFile</span><span class="p">(</span>
<span class="n">where</span><span class="p">,</span> <span class="n">memory_map</span><span class="o">=</span><span class="n">memory_map</span><span class="p">,</span>
<span class="n">decryption_properties</span><span class="o">=</span><span class="n">decryption_properties</span><span class="p">)</span>
<span class="k">return</span> <span class="n">file</span><span class="o">.</span><span class="n">schema</span><span class="o">.</span><span class="n">to_arrow_schema</span><span class="p">()</span></div>
<span class="n">__all__</span> <span class="o">=</span> <span class="p">(</span>
<span class="s2">&quot;ColumnChunkMetaData&quot;</span><span class="p">,</span>
<span class="s2">&quot;ColumnSchema&quot;</span><span class="p">,</span>
<span class="s2">&quot;FileDecryptionProperties&quot;</span><span class="p">,</span>
<span class="s2">&quot;FileEncryptionProperties&quot;</span><span class="p">,</span>
<span class="s2">&quot;FileMetaData&quot;</span><span class="p">,</span>
<span class="s2">&quot;ParquetDataset&quot;</span><span class="p">,</span>
<span class="s2">&quot;ParquetFile&quot;</span><span class="p">,</span>
<span class="s2">&quot;ParquetLogicalType&quot;</span><span class="p">,</span>
<span class="s2">&quot;ParquetReader&quot;</span><span class="p">,</span>
<span class="s2">&quot;ParquetSchema&quot;</span><span class="p">,</span>
<span class="s2">&quot;ParquetWriter&quot;</span><span class="p">,</span>
<span class="s2">&quot;RowGroupMetaData&quot;</span><span class="p">,</span>
<span class="s2">&quot;SortingColumn&quot;</span><span class="p">,</span>
<span class="s2">&quot;Statistics&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_metadata&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_pandas&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_schema&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_table&quot;</span><span class="p">,</span>
<span class="s2">&quot;write_metadata&quot;</span><span class="p">,</span>
<span class="s2">&quot;write_table&quot;</span><span class="p">,</span>
<span class="s2">&quot;write_to_dataset&quot;</span><span class="p">,</span>
<span class="s2">&quot;_filters_to_expression&quot;</span><span class="p">,</span>
<span class="s2">&quot;filters_to_expression&quot;</span><span class="p">,</span>
<span class="p">)</span>
</pre></div>
</article>
<footer class="prev-next-footer">
<div class="prev-next-area">
</div>
</footer>
</div>
<div class="bd-sidebar-secondary bd-toc"></div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../../../_static/scripts/bootstrap.js?digest=8d27b9dea8ad943066ae"></script>
<script src="../../../_static/scripts/pydata-sphinx-theme.js?digest=8d27b9dea8ad943066ae"></script>
<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
<div class="footer-items__start">
<div class="footer-item">
<p class="copyright">
© Copyright 2016-2024 Apache Software Foundation.
Apache Arrow, Arrow, Apache, the Apache feather logo, and the Apache Arrow project logo are either registered trademarks or trademarks of The Apache Software Foundation in the United States and other countries.
<br/>
</p>
</div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 6.2.0.
<br/>
</p>
</div>
</div>
<div class="footer-items__end">
<div class="footer-item">
<p class="theme-version">
Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.15.2.
</p></div>
</div>
</div>
</footer>
</body>
</html>