<!-- blob: 042a68cd3c8fc4b5d600d7e46869a67ebfd4c7ea [file] [log] [blame] -->
<!DOCTYPE html>
<html lang="en" data-content_root="" >
<head>
<meta charset="utf-8" />
<!-- The only viewport declaration for this page; keep it unique so the effective value is unambiguous. -->
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta name="generator" content="Docutils 0.19: https://docutils.sourceforge.io/" />
<title>Arrow Datasets &#8212; Apache Arrow v17.0.0.dev52</title>
<!-- Runs inline, ahead of the stylesheets below: copies the saved "mode" and "theme" values from localStorage onto the root element, falling back to "" and "light" when nothing is stored. -->
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "light";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../../_static/styles/theme.css?digest=8d27b9dea8ad943066ae" rel="stylesheet" />
<link href="../../_static/styles/bootstrap.css?digest=8d27b9dea8ad943066ae" rel="stylesheet" />
<link href="../../_static/styles/pydata-sphinx-theme.css?digest=8d27b9dea8ad943066ae" rel="stylesheet" />
<link href="../../_static/vendor/fontawesome/6.5.1/css/all.min.css?digest=8d27b9dea8ad943066ae" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../_static/vendor/fontawesome/6.5.1/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../_static/vendor/fontawesome/6.5.1/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../_static/vendor/fontawesome/6.5.1/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../../_static/pygments.css" />
<link rel="stylesheet" type="text/css" href="../../_static/copybutton.css" />
<link rel="stylesheet" type="text/css" href="../../_static/design-style.1e8bd061cd6da7fc9cf755528e8ffc24.min.css" />
<link rel="stylesheet" type="text/css" href="../../_static/theme_overrides.css" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=8d27b9dea8ad943066ae" />
<link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=8d27b9dea8ad943066ae" />
<script src="../../_static/vendor/fontawesome/6.5.1/js/all.min.js?digest=8d27b9dea8ad943066ae"></script>
<script data-url_root="../../" id="documentation_options" src="../../_static/documentation_options.js"></script>
<script src="../../_static/doctools.js"></script>
<script src="../../_static/sphinx_highlight.js"></script>
<script src="../../_static/clipboard.min.js"></script>
<script src="../../_static/copybutton.js"></script>
<script src="../../_static/design-tabs.js"></script>
<!-- Per-page additions to DOCUMENTATION_OPTIONS (the object is presumably created by documentation_options.js, loaded above; confirm before reordering these scripts). -->
<script>DOCUMENTATION_OPTIONS.pagename = 'cpp/tutorials/datasets_tutorial';</script>
<!-- Theme settings: theme version, the version-switcher JSON URL and version match string, and the version warning banner flag. -->
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.15.2';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = '/docs/_static/versions.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = 'dev/';
DOCUMENTATION_OPTIONS.show_version_warning_banner = true;
</script>
<link rel="canonical" href="https://arrow.apache.org/docs/cpp/tutorials/datasets_tutorial.html" />
<link rel="icon" href="../../_static/favicon.ico"/>
<link rel="index" title="Index" href="../../genindex.html" />
<link rel="search" title="Search" href="../../search.html" />
<link rel="next" title="User Guide" href="../user_guide.html" />
<link rel="prev" title="Arrow Compute" href="compute_tutorial.html" />
<meta name="docsearch:language" content="en"/>
<!-- Matomo -->
<script>
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
/* We explicitly disable cookie tracking to avoid privacy issues */
_paq.push(['disableCookies']);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '20']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<a id="pst-skip-link" class="skip-link" href="#main-content">Skip to main content</a>
<div id="pst-scroll-pixel-helper"></div>
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
<i class="fa-solid fa-arrow-up"></i>
Back to top
</button>
<input type="checkbox"
class="sidebar-toggle"
name="__primary"
id="__primary"/>
<label class="overlay overlay-primary" for="__primary"></label>
<input type="checkbox"
class="sidebar-toggle"
name="__secondary"
id="__secondary"/>
<label class="overlay overlay-secondary" for="__secondary"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="../../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search the docs ..."
aria-label="Search the docs ..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<header class="bd-header navbar navbar-expand-lg bd-navbar">
<div class="bd-header__inner bd-page-width">
<label class="sidebar-toggle primary-toggle" for="__primary">
<span class="fa-solid fa-bars"></span>
</label>
<div class="col-lg-3 navbar-header-items__start">
<div class="navbar-item">
<a class="navbar-brand logo" href="../../index.html">
<img src="../../_static/arrow.png" class="logo__image only-light" alt="Apache Arrow v17.0.0.dev52 - Home"/>
<script>document.write(`<img src="../../_static/arrow-dark.png" class="logo__image only-dark" alt="Apache Arrow v17.0.0.dev52 - Home"/>`);</script>
</a></div>
</div>
<div class="col-lg-9 navbar-header-items">
<div class="me-auto navbar-header-items__center">
<div class="navbar-item">
<nav class="navbar-nav">
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../../format/index.html">
Specifications
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../developers/index.html">
Development
</a>
</li>
<li class="nav-item dropdown">
<button class="btn dropdown-toggle nav-item" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-controls="pst-nav-more-links">
Implementations
</button>
<ul id="pst-nav-more-links" class="dropdown-menu">
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../../c_glib/index.html">
C/GLib
</a>
</li>
<li class="nav-item current active">
<a class="nav-link dropdown-item nav-internal" href="../index.html">
C++
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://github.com/apache/arrow/blob/main/csharp/README.md">
C#
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://pkg.go.dev/github.com/apache/arrow/go/v17">
Go
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../../java/index.html">
Java
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../../js/index.html">
JavaScript
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/julia/">
Julia
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://github.com/apache/arrow/blob/main/matlab/README.md">
MATLAB
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/nanoarrow/">
nanoarrow
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../../python/index.html">
Python
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../../r/index.html">
R
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://github.com/apache/arrow/blob/main/ruby/README.md">
Ruby
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://docs.rs/crate/arrow/">
Rust
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../../status.html">
Implementation Status
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/cpp/">
C++ cookbook
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/java/">
Java cookbook
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/py/">
Python cookbook
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/r/">
R cookbook
</a>
</li>
</ul>
</li>
</ul>
</nav></div>
</div>
<div class="navbar-header-items__end">
<div class="navbar-item navbar-persistent--container">
<script>
document.write(`
<button class="btn navbar-btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
`);
</script>
</div>
<div class="navbar-item">
<script>
document.write(`
<div class="version-switcher__container dropdown">
<button id="pst-version-switcher-button-2"
type="button"
class="version-switcher__button btn btn-sm navbar-btn dropdown-toggle"
data-bs-toggle="dropdown"
aria-haspopup="listbox"
aria-controls="pst-version-switcher-list-2"
aria-label="Version switcher list"
>
Choose version <!-- this text may get changed later by javascript -->
<span class="caret"></span>
</button>
<div id="pst-version-switcher-list-2"
class="version-switcher__menu dropdown-menu list-group-flush py-0"
role="listbox" aria-labelledby="pst-version-switcher-button-2">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
`);
</script></div>
<div class="navbar-item">
<script>
document.write(`
<button class="btn btn-sm navbar-btn theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch nav-link" data-mode="light"><i class="fa-solid fa-sun fa-lg"></i></span>
<span class="theme-switch nav-link" data-mode="dark"><i class="fa-solid fa-moon fa-lg"></i></span>
<span class="theme-switch nav-link" data-mode="auto"><i class="fa-solid fa-circle-half-stroke fa-lg"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/arrow" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-square-github fa-lg" aria-hidden="true"></i></span>
<span class="sr-only">GitHub</span></a>
</li>
<li class="nav-item">
<a href="https://twitter.com/ApacheArrow" title="X" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-square-x-twitter fa-lg" aria-hidden="true"></i></span>
<span class="sr-only">X</span></a>
</li>
</ul></div>
</div>
</div>
<div class="navbar-persistent--mobile">
<script>
document.write(`
<button class="btn navbar-btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
`);
</script>
</div>
<label class="sidebar-toggle secondary-toggle" for="__secondary" tabindex="0">
<span class="fa-solid fa-outdent"></span>
</label>
</div>
</header>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar">
<div class="sidebar-header-items sidebar-primary__section">
<div class="sidebar-header-items__center">
<div class="navbar-item">
<nav class="navbar-nav">
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../../format/index.html">
Specifications
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../developers/index.html">
Development
</a>
</li>
<li class="nav-item dropdown">
<button class="btn dropdown-toggle nav-item" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-controls="pst-nav-more-links-2">
Implementations
</button>
<ul id="pst-nav-more-links-2" class="dropdown-menu">
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../../c_glib/index.html">
C/GLib
</a>
</li>
<li class="nav-item current active">
<a class="nav-link dropdown-item nav-internal" href="../index.html">
C++
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://github.com/apache/arrow/blob/main/csharp/README.md">
C#
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://pkg.go.dev/github.com/apache/arrow/go/v17">
Go
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../../java/index.html">
Java
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../../js/index.html">
JavaScript
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/julia/">
Julia
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://github.com/apache/arrow/blob/main/matlab/README.md">
MATLAB
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/nanoarrow/">
nanoarrow
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../../python/index.html">
Python
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../../r/index.html">
R
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://github.com/apache/arrow/blob/main/ruby/README.md">
Ruby
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://docs.rs/crate/arrow/">
Rust
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../../status.html">
Implementation Status
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/cpp/">
C++ cookbook
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/java/">
Java cookbook
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/py/">
Python cookbook
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/r/">
R cookbook
</a>
</li>
</ul>
</li>
</ul>
</nav></div>
</div>
<div class="sidebar-header-items__end">
<div class="navbar-item">
<script>
document.write(`
<div class="version-switcher__container dropdown">
<button id="pst-version-switcher-button-3"
type="button"
class="version-switcher__button btn btn-sm navbar-btn dropdown-toggle"
data-bs-toggle="dropdown"
aria-haspopup="listbox"
aria-controls="pst-version-switcher-list-3"
aria-label="Version switcher list"
>
Choose version <!-- this text may get changed later by javascript -->
<span class="caret"></span>
</button>
<div id="pst-version-switcher-list-3"
class="version-switcher__menu dropdown-menu list-group-flush py-0"
role="listbox" aria-labelledby="pst-version-switcher-button-3">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
`);
</script></div>
<div class="navbar-item">
<script>
document.write(`
<button class="btn btn-sm navbar-btn theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch nav-link" data-mode="light"><i class="fa-solid fa-sun fa-lg"></i></span>
<span class="theme-switch nav-link" data-mode="dark"><i class="fa-solid fa-moon fa-lg"></i></span>
<span class="theme-switch nav-link" data-mode="auto"><i class="fa-solid fa-circle-half-stroke fa-lg"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/arrow" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-square-github fa-lg" aria-hidden="true"></i></span>
<span class="sr-only">GitHub</span></a>
</li>
<li class="nav-item">
<a href="https://twitter.com/ApacheArrow" title="X" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-square-x-twitter fa-lg" aria-hidden="true"></i></span>
<span class="sr-only">X</span></a>
</li>
</ul></div>
</div>
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item">
<nav class="bd-docs-nav bd-links"
aria-label="Section Navigation">
<p class="bd-links__title" role="heading" aria-level="1">Section Navigation</p>
<div class="bd-toc-item navbar-nav"><ul class="current nav bd-sidenav">
<li class="toctree-l1 current active has-children"><a class="reference internal" href="../getting_started.html">Getting Started</a><input checked="" class="toctree-checkbox" id="toctree-checkbox-1" name="toctree-checkbox-1" type="checkbox"/><label class="toctree-toggle" for="toctree-checkbox-1"><i class="fa-solid fa-chevron-down"></i></label><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="../build_system.html">Using Arrow C++ in your own project</a></li>
<li class="toctree-l2"><a class="reference internal" href="../conventions.html">Conventions</a></li>
<li class="toctree-l2"><a class="reference internal" href="basic_arrow.html">Basic Arrow Data Structures</a></li>
<li class="toctree-l2"><a class="reference internal" href="io_tutorial.html">Arrow File I/O</a></li>
<li class="toctree-l2"><a class="reference internal" href="compute_tutorial.html">Arrow Compute</a></li>
<li class="toctree-l2 current active"><a class="current reference internal" href="#">Arrow Datasets</a></li>
</ul>
</li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../user_guide.html">User Guide</a><input class="toctree-checkbox" id="toctree-checkbox-2" name="toctree-checkbox-2" type="checkbox"/><label class="toctree-toggle" for="toctree-checkbox-2"><i class="fa-solid fa-chevron-down"></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../overview.html">High-Level Overview</a></li>
<li class="toctree-l2"><a class="reference internal" href="../memory.html">Memory Management</a></li>
<li class="toctree-l2"><a class="reference internal" href="../arrays.html">Arrays</a></li>
<li class="toctree-l2"><a class="reference internal" href="../datatypes.html">Data Types</a></li>
<li class="toctree-l2"><a class="reference internal" href="../tables.html">Tabular Data</a></li>
<li class="toctree-l2"><a class="reference internal" href="../compute.html">Compute Functions</a></li>
<li class="toctree-l2 has-children"><a class="reference internal" href="../gandiva.html">The Gandiva Expression Compiler</a><input class="toctree-checkbox" id="toctree-checkbox-3" name="toctree-checkbox-3" type="checkbox"/><label class="toctree-toggle" for="toctree-checkbox-3"><i class="fa-solid fa-chevron-down"></i></label><ul>
<li class="toctree-l3"><a class="reference internal" href="../gandiva/expr_projector_filter.html">Gandiva Expression, Projector, and Filter</a></li>
<li class="toctree-l3"><a class="reference internal" href="../gandiva/external_func.html">Gandiva External Functions Development Guide</a></li>
</ul>
</li>
<li class="toctree-l2 has-children"><a class="reference internal" href="../streaming_execution.html">Acero: A C++ streaming execution engine</a><input class="toctree-checkbox" id="toctree-checkbox-4" name="toctree-checkbox-4" type="checkbox"/><label class="toctree-toggle" for="toctree-checkbox-4"><i class="fa-solid fa-chevron-down"></i></label><ul>
<li class="toctree-l3"><a class="reference internal" href="../acero/overview.html">Acero Overview</a></li>
<li class="toctree-l3"><a class="reference internal" href="../acero/user_guide.html">Acero User’s Guide</a></li>
<li class="toctree-l3"><a class="reference internal" href="../acero/substrait.html">Using Acero with Substrait</a></li>
<li class="toctree-l3"><a class="reference internal" href="../acero/developer_guide.html">Developer’s Guide</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../io.html">Input / output and filesystems</a></li>
<li class="toctree-l2"><a class="reference internal" href="../ipc.html">Reading and writing the Arrow IPC format</a></li>
<li class="toctree-l2"><a class="reference internal" href="../orc.html">Reading and Writing ORC files</a></li>
<li class="toctree-l2"><a class="reference internal" href="../parquet.html">Reading and writing Parquet files</a></li>
<li class="toctree-l2"><a class="reference internal" href="../csv.html">Reading and Writing CSV files</a></li>
<li class="toctree-l2"><a class="reference internal" href="../json.html">Reading JSON files</a></li>
<li class="toctree-l2"><a class="reference internal" href="../dataset.html">Tabular Datasets</a></li>
<li class="toctree-l2"><a class="reference internal" href="../flight.html">Arrow Flight RPC</a></li>
<li class="toctree-l2"><a class="reference internal" href="../gdb.html">Debugging code using Arrow</a></li>
<li class="toctree-l2"><a class="reference internal" href="../threading.html">Thread Management</a></li>
<li class="toctree-l2"><a class="reference internal" href="../opentelemetry.html">OpenTelemetry</a></li>
<li class="toctree-l2"><a class="reference internal" href="../env_vars.html">Environment Variables</a></li>
</ul>
</li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../examples/index.html">Examples</a><input class="toctree-checkbox" id="toctree-checkbox-5" name="toctree-checkbox-5" type="checkbox"/><label class="toctree-toggle" for="toctree-checkbox-5"><i class="fa-solid fa-chevron-down"></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../examples/cmake_minimal_build.html">Minimal build using CMake</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/compute_and_write_example.html">Compute and Write CSV Example</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/dataset_documentation_example.html">Arrow Datasets example</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/dataset_skyhook_scan_example.html">Arrow Skyhook example</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/row_columnar_conversion.html">Row to columnar conversion</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/tuple_range_conversion.html">std::tuple-like ranges to Arrow</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/converting_recordbatch_to_tensor.html">Converting RecordBatch to Tensor</a></li>
</ul>
</li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../api.html">API Reference</a><input class="toctree-checkbox" id="toctree-checkbox-6" name="toctree-checkbox-6" type="checkbox"/><label class="toctree-toggle" for="toctree-checkbox-6"><i class="fa-solid fa-chevron-down"></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../api/support.html">Programming Support</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/memory.html">Memory (management)</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/thread.html">Thread (management)</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/datatype.html">Data Types</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/array.html">Arrays</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/scalar.html">Scalars</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/builder.html">Array Builders</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/table.html">Two-dimensional Datasets</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/c_abi.html">C Interfaces</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/compute.html">Compute Functions</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/acero.html">Streaming Execution (Acero)</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/gandiva.html">Gandiva Expression Compiler</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/tensor.html">Tensors</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/utilities.html">Utilities</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/async.html">Asynchronous programming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/io.html">Input / output</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/ipc.html">Arrow IPC</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/formats.html">File Formats</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/cuda.html">CUDA support</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/flight.html">Arrow Flight RPC</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/flightsql.html">Arrow Flight SQL</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/filesystem.html">Filesystems</a></li>
<li class="toctree-l2"><a class="reference internal" href="../api/dataset.html">Dataset</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/cookbook/cpp/">C++ cookbook</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main">
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item">
<nav aria-label="Breadcrumb">
<ul class="bd-breadcrumbs">
<li class="breadcrumb-item breadcrumb-home">
<a href="../../index.html" class="nav-link" aria-label="Home">
<i class="fa-solid fa-home"></i>
</a>
</li>
<li class="breadcrumb-item"><a href="../index.html" class="nav-link">C++ Implementation</a></li>
<li class="breadcrumb-item"><a href="../getting_started.html" class="nav-link">Getting Started</a></li>
<li class="breadcrumb-item active" aria-current="page">Arrow Datasets</li>
</ul>
</nav>
</div>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article">
<section id="arrow-datasets">
<h1>Arrow Datasets<a class="headerlink" href="#arrow-datasets" title="Permalink to this heading">#</a></h1>
<p>Arrow C++ provides the concept and implementation of <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7DatasetE" title="arrow::dataset::Dataset"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">Datasets</span></code></a> to work
with fragmented data, which can be larger-than-memory, be that due to
generating large amounts, reading in from a stream, or having a large
file on disk. In this article, you will:</p>
<ol class="arabic simple">
<li><p>read a multi-file partitioned dataset and put it into a Table,</p></li>
<li><p>write out a partitioned dataset from a Table.</p></li>
</ol>
<section id="pre-requisites">
<h2>Pre-requisites<a class="headerlink" href="#pre-requisites" title="Permalink to this heading">#</a></h2>
<p>Before continuing, make sure you have:</p>
<ol class="arabic simple">
<li><p>An Arrow installation, which you can set up here: <a class="reference internal" href="../build_system.html"><span class="doc">Using Arrow C++ in your own project</span></a></p></li>
<li><p>An understanding of basic Arrow data structures from <a class="reference internal" href="basic_arrow.html"><span class="doc">Basic Arrow Data Structures</span></a></p></li>
</ol>
<p>To witness the differences, it may be useful to have also read the <a class="reference internal" href="io_tutorial.html"><span class="doc">Arrow File I/O</span></a> tutorial. However, it is not required.</p>
</section>
<section id="setup">
<h2>Setup<a class="headerlink" href="#setup" title="Permalink to this heading">#</a></h2>
<p>Before reading and writing datasets, we need to fill in a few gaps:</p>
<ol class="arabic simple">
<li><p>We need to include necessary headers.</p></li>
<li><p>A <code class="docutils literal notranslate"><span class="pre">main()</span></code> is needed to glue things together.</p></li>
<li><p>We need data on disk to play with.</p></li>
</ol>
<section id="includes">
<h3>Includes<a class="headerlink" href="#includes" title="Permalink to this heading">#</a></h3>
<p>Before writing C++ code, we need some includes. We’ll get <code class="docutils literal notranslate"><span class="pre">iostream</span></code> for output, then include Arrow’s
core and dataset APIs, along with the Parquet headers we use to generate the example files in this article:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;arrow/api.h&gt;</span>
<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;arrow/dataset/api.h&gt;</span>
<span class="c1">// We use Parquet headers for setting up examples; they are not required for using</span>
<span class="c1">// datasets.</span>
<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;parquet/arrow/reader.h&gt;</span>
<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;parquet/arrow/writer.h&gt;</span>
<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;unistd.h&gt;</span>
<span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;iostream&gt;</span>
</pre></div>
</div>
</section>
<section id="main">
<h3>Main()<a class="headerlink" href="#main" title="Permalink to this heading">#</a></h3>
<p>For our glue, we’ll use the <code class="docutils literal notranslate"><span class="pre">main()</span></code> pattern from the previous tutorial on
data structures:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="kt">int</span><span class="w"> </span><span class="nf">main</span><span class="p">()</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">Status</span><span class="w"> </span><span class="n">st</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">RunMain</span><span class="p">();</span>
<span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="o">!</span><span class="n">st</span><span class="p">.</span><span class="n">ok</span><span class="p">())</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">cerr</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">st</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">endl</span><span class="p">;</span>
<span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="mi">1</span><span class="p">;</span>
<span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span>
<span class="p">}</span>
</pre></div>
</div>
<p>Just as when we used it before, this is paired with a <code class="docutils literal notranslate"><span class="pre">RunMain()</span></code>:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="n">arrow</span><span class="o">::</span><span class="n">Status</span><span class="w"> </span><span class="nf">RunMain</span><span class="p">()</span><span class="w"> </span><span class="p">{</span>
</pre></div>
</div>
</section>
<section id="generating-files-for-reading">
<h3>Generating Files for Reading<a class="headerlink" href="#generating-files-for-reading" title="Permalink to this heading">#</a></h3>
<p>We need some files to actually play with. In practice, you’ll likely
have some input for your own application. Here, however, we want to
explore without the overhead of supplying or finding a dataset, so let’s
generate some to make this easy to follow. Feel free to read through
this, but the concepts will be covered properly later in this article – just
copy it in, for now, and realize it ends with a partitioned dataset on
disk:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="c1">// Generate some data for the rest of this example.</span>
<span class="n">arrow</span><span class="o">::</span><span class="n">Result</span><span class="o">&lt;</span><span class="n">std</span><span class="o">::</span><span class="n">shared_ptr</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">Table</span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="n">CreateTable</span><span class="p">()</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="c1">// This code should look familiar from the basic Arrow example, and is not the</span>
<span class="w"> </span><span class="c1">// focus of this example. However, we need data to work on, and this makes that!</span>
<span class="w"> </span><span class="k">auto</span><span class="w"> </span><span class="n">schema</span><span class="w"> </span><span class="o">=</span>
<span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">schema</span><span class="p">({</span><span class="n">arrow</span><span class="o">::</span><span class="n">field</span><span class="p">(</span><span class="s">&quot;a&quot;</span><span class="p">,</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">int64</span><span class="p">()),</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">field</span><span class="p">(</span><span class="s">&quot;b&quot;</span><span class="p">,</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">int64</span><span class="p">()),</span>
<span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">field</span><span class="p">(</span><span class="s">&quot;c&quot;</span><span class="p">,</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">int64</span><span class="p">())});</span>
<span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">shared_ptr</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">Array</span><span class="o">&gt;</span><span class="w"> </span><span class="n">array_a</span><span class="p">;</span>
<span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">shared_ptr</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">Array</span><span class="o">&gt;</span><span class="w"> </span><span class="n">array_b</span><span class="p">;</span>
<span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">shared_ptr</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">Array</span><span class="o">&gt;</span><span class="w"> </span><span class="n">array_c</span><span class="p">;</span>
<span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">NumericBuilder</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">Int64Type</span><span class="o">&gt;</span><span class="w"> </span><span class="n">builder</span><span class="p">;</span>
<span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span><span class="n">builder</span><span class="p">.</span><span class="n">AppendValues</span><span class="p">({</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span><span class="p">,</span><span class="w"> </span><span class="mi">5</span><span class="p">,</span><span class="w"> </span><span class="mi">6</span><span class="p">,</span><span class="w"> </span><span class="mi">7</span><span class="p">,</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span><span class="w"> </span><span class="mi">9</span><span class="p">}));</span>
<span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span><span class="n">builder</span><span class="p">.</span><span class="n">Finish</span><span class="p">(</span><span class="o">&amp;</span><span class="n">array_a</span><span class="p">));</span>
<span class="w"> </span><span class="n">builder</span><span class="p">.</span><span class="n">Reset</span><span class="p">();</span>
<span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span><span class="n">builder</span><span class="p">.</span><span class="n">AppendValues</span><span class="p">({</span><span class="mi">9</span><span class="p">,</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span><span class="w"> </span><span class="mi">7</span><span class="p">,</span><span class="w"> </span><span class="mi">6</span><span class="p">,</span><span class="w"> </span><span class="mi">5</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">}));</span>
<span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span><span class="n">builder</span><span class="p">.</span><span class="n">Finish</span><span class="p">(</span><span class="o">&amp;</span><span class="n">array_b</span><span class="p">));</span>
<span class="w"> </span><span class="n">builder</span><span class="p">.</span><span class="n">Reset</span><span class="p">();</span>
<span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span><span class="n">builder</span><span class="p">.</span><span class="n">AppendValues</span><span class="p">({</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">}));</span>
<span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span><span class="n">builder</span><span class="p">.</span><span class="n">Finish</span><span class="p">(</span><span class="o">&amp;</span><span class="n">array_c</span><span class="p">));</span>
<span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">Table</span><span class="o">::</span><span class="n">Make</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span><span class="w"> </span><span class="p">{</span><span class="n">array_a</span><span class="p">,</span><span class="w"> </span><span class="n">array_b</span><span class="p">,</span><span class="w"> </span><span class="n">array_c</span><span class="p">});</span>
<span class="p">}</span>
<span class="c1">// Set up a dataset by writing two Parquet files.</span>
<span class="n">arrow</span><span class="o">::</span><span class="n">Result</span><span class="o">&lt;</span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&gt;</span><span class="w"> </span><span class="n">CreateExampleParquetDataset</span><span class="p">(</span>
<span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">shared_ptr</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">fs</span><span class="o">::</span><span class="n">FileSystem</span><span class="o">&gt;&amp;</span><span class="w"> </span><span class="n">filesystem</span><span class="p">,</span>
<span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&amp;</span><span class="w"> </span><span class="n">root_path</span><span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="c1">// Much like CreateTable(), this is a utility that gets us the dataset we&#39;ll be reading</span>
<span class="w"> </span><span class="c1">// from. Don&#39;t worry, we also write a dataset in the example proper.</span>
<span class="w"> </span><span class="k">auto</span><span class="w"> </span><span class="n">base_path</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">root_path</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="s">&quot;parquet_dataset&quot;</span><span class="p">;</span>
<span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span><span class="n">filesystem</span><span class="o">-&gt;</span><span class="n">CreateDir</span><span class="p">(</span><span class="n">base_path</span><span class="p">));</span>
<span class="w"> </span><span class="c1">// Create an Arrow Table</span>
<span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">table</span><span class="p">,</span><span class="w"> </span><span class="n">CreateTable</span><span class="p">());</span>
<span class="w"> </span><span class="c1">// Write it into two Parquet files</span>
<span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">output</span><span class="p">,</span>
<span class="w"> </span><span class="n">filesystem</span><span class="o">-&gt;</span><span class="n">OpenOutputStream</span><span class="p">(</span><span class="n">base_path</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="s">&quot;/data1.parquet&quot;</span><span class="p">));</span>
<span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span><span class="n">parquet</span><span class="o">::</span><span class="n">arrow</span><span class="o">::</span><span class="n">WriteTable</span><span class="p">(</span>
<span class="w"> </span><span class="o">*</span><span class="n">table</span><span class="o">-&gt;</span><span class="n">Slice</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="mi">5</span><span class="p">),</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">default_memory_pool</span><span class="p">(),</span><span class="w"> </span><span class="n">output</span><span class="p">,</span><span class="w"> </span><span class="mi">2048</span><span class="p">));</span>
<span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="n">output</span><span class="p">,</span>
<span class="w"> </span><span class="n">filesystem</span><span class="o">-&gt;</span><span class="n">OpenOutputStream</span><span class="p">(</span><span class="n">base_path</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="s">&quot;/data2.parquet&quot;</span><span class="p">));</span>
<span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span><span class="n">parquet</span><span class="o">::</span><span class="n">arrow</span><span class="o">::</span><span class="n">WriteTable</span><span class="p">(</span>
<span class="w"> </span><span class="o">*</span><span class="n">table</span><span class="o">-&gt;</span><span class="n">Slice</span><span class="p">(</span><span class="mi">5</span><span class="p">),</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">default_memory_pool</span><span class="p">(),</span><span class="w"> </span><span class="n">output</span><span class="p">,</span><span class="w"> </span><span class="mi">2048</span><span class="p">));</span>
<span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="n">base_path</span><span class="p">;</span>
<span class="p">}</span>
<span class="n">arrow</span><span class="o">::</span><span class="n">Status</span><span class="w"> </span><span class="n">PrepareEnv</span><span class="p">()</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="c1">// Get our environment prepared for reading, by setting up some quick writing.</span>
<span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">src_table</span><span class="p">,</span><span class="w"> </span><span class="n">CreateTable</span><span class="p">())</span>
<span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">shared_ptr</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">fs</span><span class="o">::</span><span class="n">FileSystem</span><span class="o">&gt;</span><span class="w"> </span><span class="n">setup_fs</span><span class="p">;</span>
<span class="w"> </span><span class="c1">// Note this operates in the current working directory.</span>
<span class="w"> </span><span class="kt">char</span><span class="w"> </span><span class="n">setup_path</span><span class="p">[</span><span class="mi">256</span><span class="p">];</span>
<span class="w"> </span><span class="kt">char</span><span class="o">*</span><span class="w"> </span><span class="n">result</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">getcwd</span><span class="p">(</span><span class="n">setup_path</span><span class="p">,</span><span class="w"> </span><span class="mi">256</span><span class="p">);</span>
<span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">result</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="nb">NULL</span><span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">Status</span><span class="o">::</span><span class="n">IOError</span><span class="p">(</span><span class="s">&quot;Fetching PWD failed.&quot;</span><span class="p">);</span>
<span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="n">setup_fs</span><span class="p">,</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">fs</span><span class="o">::</span><span class="n">FileSystemFromUriOrPath</span><span class="p">(</span><span class="n">setup_path</span><span class="p">));</span>
<span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">dset_path</span><span class="p">,</span><span class="w"> </span><span class="n">CreateExampleParquetDataset</span><span class="p">(</span><span class="n">setup_fs</span><span class="p">,</span><span class="w"> </span><span class="s">&quot;&quot;</span><span class="p">));</span>
<span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">Status</span><span class="o">::</span><span class="n">OK</span><span class="p">();</span>
<span class="p">}</span>
</pre></div>
</div>
<p>In order to actually have these files, make sure the first thing called
in <code class="docutils literal notranslate"><span class="pre">RunMain()</span></code> is our helper function <code class="docutils literal notranslate"><span class="pre">PrepareEnv()</span></code>, which will get a
dataset on disk for us to play with:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span><span class="n">PrepareEnv</span><span class="p">());</span>
</pre></div>
</div>
</section>
</section>
<section id="reading-a-partitioned-dataset">
<h2>Reading a Partitioned Dataset<a class="headerlink" href="#reading-a-partitioned-dataset" title="Permalink to this heading">#</a></h2>
<p>Reading a Dataset is a distinct task from reading a single file. The
task takes more work than reading a single file, due to needing to be
able to parse multiple files and/or folders. This process can be broken
up into the following steps:</p>
<ol class="arabic simple">
<li><p>Get a <a class="reference internal" href="../api/filesystem.html#_CPPv4N5arrow2fs10FileSystemE" title="arrow::fs::FileSystem"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">fs::FileSystem</span></code></a> object for the local FS</p></li>
<li><p>Create a <a class="reference internal" href="../api/filesystem.html#_CPPv4N5arrow2fs12FileSelectorE" title="arrow::fs::FileSelector"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">fs::FileSelector</span></code></a> and use it to prepare a <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset24FileSystemDatasetFactoryE" title="arrow::dataset::FileSystemDatasetFactory"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::FileSystemDatasetFactory</span></code></a></p></li>
<li><p>Build a <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7DatasetE" title="arrow::dataset::Dataset"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::Dataset</span></code></a> using the <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset24FileSystemDatasetFactoryE" title="arrow::dataset::FileSystemDatasetFactory"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::FileSystemDatasetFactory</span></code></a></p></li>
<li><p>Use a <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7ScannerE" title="arrow::dataset::Scanner"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::Scanner</span></code></a> to read into a <a class="reference internal" href="../api/table.html#_CPPv4N5arrow5TableE" title="arrow::Table"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">Table</span></code></a></p></li>
</ol>
<section id="preparing-a-filesystem-object">
<h3>Preparing a FileSystem Object<a class="headerlink" href="#preparing-a-filesystem-object" title="Permalink to this heading">#</a></h3>
<p>In order to begin, we’ll need to be able to interact with the local
filesystem. In order to do that, we’ll need an <a class="reference internal" href="../api/filesystem.html#_CPPv4N5arrow2fs10FileSystemE" title="arrow::fs::FileSystem"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">fs::FileSystem</span></code></a> object.
A <a class="reference internal" href="../api/filesystem.html#_CPPv4N5arrow2fs10FileSystemE" title="arrow::fs::FileSystem"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">fs::FileSystem</span></code></a> is an abstraction that lets us use the same interface
regardless of whether we’re using Amazon S3, Google Cloud Storage, or local disk – and
we’ll be using local disk. So, let’s declare it:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="c1">// First, we need a filesystem object, which lets us interact with our local</span>
<span class="w"> </span><span class="c1">// filesystem starting at a given path. For the sake of simplicity, that&#39;ll be</span>
<span class="w"> </span><span class="c1">// the current directory.</span>
<span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">shared_ptr</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">fs</span><span class="o">::</span><span class="n">FileSystem</span><span class="o">&gt;</span><span class="w"> </span><span class="n">fs</span><span class="p">;</span>
</pre></div>
</div>
<p>For this example, we’ll have our <a class="reference internal" href="../api/filesystem.html#_CPPv4N5arrow2fs10FileSystemE" title="arrow::fs::FileSystem"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">FileSystem’s</span></code></a> base path be the
current working directory, i.e. the directory the program is run from. <code class="xref cpp cpp-func docutils literal notranslate"><span class="pre">fs::FileSystemFromUriOrPath()</span></code> lets us get
a <a class="reference internal" href="../api/filesystem.html#_CPPv4N5arrow2fs10FileSystemE" title="arrow::fs::FileSystem"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">fs::FileSystem</span></code></a> object for any of the types of supported filesystems.
Here, though, we’ll just pass our path:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="c1">// Get the CWD, use it to make the FileSystem object.</span>
<span class="w"> </span><span class="kt">char</span><span class="w"> </span><span class="n">init_path</span><span class="p">[</span><span class="mi">256</span><span class="p">];</span>
<span class="w"> </span><span class="kt">char</span><span class="o">*</span><span class="w"> </span><span class="n">result</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">getcwd</span><span class="p">(</span><span class="n">init_path</span><span class="p">,</span><span class="w"> </span><span class="mi">256</span><span class="p">);</span>
<span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">result</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="nb">NULL</span><span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">Status</span><span class="o">::</span><span class="n">IOError</span><span class="p">(</span><span class="s">&quot;Fetching PWD failed.&quot;</span><span class="p">);</span>
<span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="n">fs</span><span class="p">,</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">fs</span><span class="o">::</span><span class="n">FileSystemFromUriOrPath</span><span class="p">(</span><span class="n">init_path</span><span class="p">));</span>
</pre></div>
</div>
<div class="admonition seealso">
<p class="admonition-title">See also</p>
<p><a class="reference internal" href="../api/filesystem.html#_CPPv4N5arrow2fs10FileSystemE" title="arrow::fs::FileSystem"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">fs::FileSystem</span></code></a> for the other supported filesystems.</p>
</div>
</section>
<section id="creating-a-filesystemdatasetfactory">
<h3>Creating a FileSystemDatasetFactory<a class="headerlink" href="#creating-a-filesystemdatasetfactory" title="Permalink to this heading">#</a></h3>
<p>A <a class="reference internal" href="../api/filesystem.html#_CPPv4N5arrow2fs10FileSystemE" title="arrow::fs::FileSystem"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">fs::FileSystem</span></code></a> stores a lot of metadata, but we need to be able to
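traverse it and parse that metadata. In Arrow, we use a <a class="reference internal" href="../api/filesystem.html#_CPPv4N5arrow2fs12FileSelectorE" title="arrow::fs::FileSelector"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">fs::FileSelector</span></code></a> to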
do so:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="c1">// A file selector lets us actually traverse a multi-file dataset.</span>
<span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">fs</span><span class="o">::</span><span class="n">FileSelector</span><span class="w"> </span><span class="n">selector</span><span class="p">;</span>
</pre></div>
</div>
<p>This <a class="reference internal" href="../api/filesystem.html#_CPPv4N5arrow2fs12FileSelectorE" title="arrow::fs::FileSelector"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">fs::FileSelector</span></code></a> isn’t able to do anything yet. In order to use it, we
need to configure it – we’ll have it start any selection in
“parquet_dataset,” which is where the environment preparation process
has left us a dataset, and set recursive to true, which allows for
traversal of folders.</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="n">selector</span><span class="p">.</span><span class="n">base_dir</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">&quot;parquet_dataset&quot;</span><span class="p">;</span>
<span class="w"> </span><span class="c1">// Recursive is a safe bet if you don&#39;t know the nesting of your dataset.</span>
<span class="w"> </span><span class="n">selector</span><span class="p">.</span><span class="n">recursive</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="nb">true</span><span class="p">;</span>
</pre></div>
</div>
<p>To get a <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7DatasetE" title="arrow::dataset::Dataset"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::Dataset</span></code></a> from a <a class="reference internal" href="../api/filesystem.html#_CPPv4N5arrow2fs10FileSystemE" title="arrow::fs::FileSystem"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">fs::FileSystem</span></code></a>, we need to prepare a
<a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset24FileSystemDatasetFactoryE" title="arrow::dataset::FileSystemDatasetFactory"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::FileSystemDatasetFactory</span></code></a>. This is a long but descriptive name – it’ll
make us a factory to get data from our <a class="reference internal" href="../api/filesystem.html#_CPPv4N5arrow2fs10FileSystemE" title="arrow::fs::FileSystem"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">fs::FileSystem</span></code></a>. First, we configure
it by filling a <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset24FileSystemFactoryOptionsE" title="arrow::dataset::FileSystemFactoryOptions"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::FileSystemFactoryOptions</span></code></a> struct:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="c1">// Making an options object lets us configure our dataset reading.</span>
<span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">dataset</span><span class="o">::</span><span class="n">FileSystemFactoryOptions</span><span class="w"> </span><span class="n">options</span><span class="p">;</span>
<span class="w"> </span><span class="c1">// We&#39;ll use Hive-style partitioning. We&#39;ll let Arrow Datasets infer the partition</span>
<span class="w"> </span><span class="c1">// schema. We won&#39;t set any other options; the defaults are fine.</span>
<span class="w"> </span><span class="n">options</span><span class="p">.</span><span class="n">partitioning</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">dataset</span><span class="o">::</span><span class="n">HivePartitioning</span><span class="o">::</span><span class="n">MakeFactory</span><span class="p">();</span>
</pre></div>
</div>
<p>There are many file formats, and we have to pick one that will be
expected when actually reading. Parquet is what we have on disk, so of
course we’ll ask for that when reading:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="k">auto</span><span class="w"> </span><span class="n">read_format</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">make_shared</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">dataset</span><span class="o">::</span><span class="n">ParquetFileFormat</span><span class="o">&gt;</span><span class="p">();</span>
</pre></div>
</div>
<p>After setting up the <a class="reference internal" href="../api/filesystem.html#_CPPv4N5arrow2fs10FileSystemE" title="arrow::fs::FileSystem"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">fs::FileSystem</span></code></a>, <a class="reference internal" href="../api/filesystem.html#_CPPv4N5arrow2fs12FileSelectorE" title="arrow::fs::FileSelector"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">fs::FileSelector</span></code></a>, options, and file format,
we can make that <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset24FileSystemDatasetFactoryE" title="arrow::dataset::FileSystemDatasetFactory"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::FileSystemDatasetFactory</span></code></a>. This simply requires passing
in everything we’ve prepared and assigning that to a variable:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="c1">// Now, we get a factory that will let us get our dataset -- we don&#39;t have the</span>
<span class="w"> </span><span class="c1">// dataset yet!</span>
<span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">factory</span><span class="p">,</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">dataset</span><span class="o">::</span><span class="n">FileSystemDatasetFactory</span><span class="o">::</span><span class="n">Make</span><span class="p">(</span>
<span class="w"> </span><span class="n">fs</span><span class="p">,</span><span class="w"> </span><span class="n">selector</span><span class="p">,</span><span class="w"> </span><span class="n">read_format</span><span class="p">,</span><span class="w"> </span><span class="n">options</span><span class="p">));</span>
</pre></div>
</div>
</section>
<section id="build-dataset-using-factory">
<h3>Build Dataset using Factory<a class="headerlink" href="#build-dataset-using-factory" title="Permalink to this heading">#</a></h3>
<p>With a <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset24FileSystemDatasetFactoryE" title="arrow::dataset::FileSystemDatasetFactory"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::FileSystemDatasetFactory</span></code></a> set up, we can actually build our
<a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7DatasetE" title="arrow::dataset::Dataset"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::Dataset</span></code></a> with <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset24FileSystemDatasetFactory6FinishE13FinishOptions" title="arrow::dataset::FileSystemDatasetFactory::Finish"><code class="xref cpp cpp-func docutils literal notranslate"><span class="pre">dataset::FileSystemDatasetFactory::Finish()</span></code></a>, just
like with an <a class="reference internal" href="../api/builder.html#_CPPv4N5arrow12ArrayBuilderE" title="arrow::ArrayBuilder"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">ArrayBuilder</span></code></a> back in the basic tutorial:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="c1">// Now we build our dataset from the factory.</span>
<span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">read_dataset</span><span class="p">,</span><span class="w"> </span><span class="n">factory</span><span class="o">-&gt;</span><span class="n">Finish</span><span class="p">());</span>
</pre></div>
</div>
<p>Now, we have a <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7DatasetE" title="arrow::dataset::Dataset"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::Dataset</span></code></a> object in memory. This does not mean that the
entire dataset is manifested in memory, but that we now have access to
tools that allow us to explore and use the dataset that is on disk. For
example, we can grab the fragments (files) that make up our whole
dataset, and print those out, along with some small info:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="c1">// Print out the fragments</span>
<span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">fragments</span><span class="p">,</span><span class="w"> </span><span class="n">read_dataset</span><span class="o">-&gt;</span><span class="n">GetFragments</span><span class="p">());</span>
<span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="k">auto</span><span class="o">&amp;</span><span class="w"> </span><span class="n">fragment</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="n">fragments</span><span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">cout</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;Found fragment: &quot;</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="p">(</span><span class="o">*</span><span class="n">fragment</span><span class="p">)</span><span class="o">-&gt;</span><span class="n">ToString</span><span class="p">()</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">endl</span><span class="p">;</span>
<span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">cout</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;Partition expression: &quot;</span>
<span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="p">(</span><span class="o">*</span><span class="n">fragment</span><span class="p">)</span><span class="o">-&gt;</span><span class="n">partition_expression</span><span class="p">().</span><span class="n">ToString</span><span class="p">()</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">endl</span><span class="p">;</span>
<span class="w"> </span><span class="p">}</span>
</pre></div>
</div>
</section>
<section id="move-dataset-into-table">
<h3>Move Dataset into Table<a class="headerlink" href="#move-dataset-into-table" title="Permalink to this heading">#</a></h3>
<p>One way we can work with <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7DatasetE" title="arrow::dataset::Dataset"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">Datasets</span></code></a> is to read
them into a <a class="reference internal" href="../api/table.html#_CPPv4N5arrow5TableE" title="arrow::Table"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">Table</span></code></a>, where we can do anything we’ve learned we can do with
<a class="reference internal" href="../api/table.html#_CPPv4N5arrow5TableE" title="arrow::Table"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">Tables</span></code></a>.</p>
<div class="admonition seealso">
<p class="admonition-title">See also</p>
<p><a class="reference internal" href="../streaming_execution.html"><span class="doc">Acero: A C++ streaming execution engine</span></a> for execution that avoids manifesting the entire dataset in memory.</p>
</div>
<p>In order to move a <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7DatasetE" title="arrow::dataset::Dataset"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">Dataset’s</span></code></a> contents into a <a class="reference internal" href="../api/table.html#_CPPv4N5arrow5TableE" title="arrow::Table"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">Table</span></code></a>,
we need a <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7ScannerE" title="arrow::dataset::Scanner"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::Scanner</span></code></a>, which scans the data and outputs it to the <a class="reference internal" href="../api/table.html#_CPPv4N5arrow5TableE" title="arrow::Table"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">Table</span></code></a>.
First, we get a <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset14ScannerBuilderE" title="arrow::dataset::ScannerBuilder"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::ScannerBuilder</span></code></a> from the <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7DatasetE" title="arrow::dataset::Dataset"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::Dataset</span></code></a>:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="c1">// Scan dataset into a Table -- once this is done, you can do</span>
<span class="w"> </span><span class="c1">// normal table things with it, like computation and printing. However, now you&#39;re</span>
<span class="w"> </span><span class="c1">// also dedicated to being in memory.</span>
<span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">read_scan_builder</span><span class="p">,</span><span class="w"> </span><span class="n">read_dataset</span><span class="o">-&gt;</span><span class="n">NewScan</span><span class="p">());</span>
</pre></div>
</div>
<p>Of course, a Builder’s only use is to get us our <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7ScannerE" title="arrow::dataset::Scanner"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::Scanner</span></code></a>, so let’s use
<a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset14ScannerBuilder6FinishEv" title="arrow::dataset::ScannerBuilder::Finish"><code class="xref cpp cpp-func docutils literal notranslate"><span class="pre">dataset::ScannerBuilder::Finish()</span></code></a>:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">read_scanner</span><span class="p">,</span><span class="w"> </span><span class="n">read_scan_builder</span><span class="o">-&gt;</span><span class="n">Finish</span><span class="p">());</span>
</pre></div>
</div>
<p>Now that we have a tool to move through our <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7DatasetE" title="arrow::dataset::Dataset"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::Dataset</span></code></a>, let’s use it to get
our <a class="reference internal" href="../api/table.html#_CPPv4N5arrow5TableE" title="arrow::Table"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">Table</span></code></a>. <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7Scanner7ToTableEv" title="arrow::dataset::Scanner::ToTable"><code class="xref cpp cpp-func docutils literal notranslate"><span class="pre">dataset::Scanner::ToTable()</span></code></a> offers exactly what we’re looking for,
and we can print the results:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="n">std</span><span class="o">::</span><span class="n">shared_ptr</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">Table</span><span class="o">&gt;</span><span class="w"> </span><span class="n">table</span><span class="p">,</span><span class="w"> </span><span class="n">read_scanner</span><span class="o">-&gt;</span><span class="n">ToTable</span><span class="p">());</span>
<span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">cout</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">table</span><span class="o">-&gt;</span><span class="n">ToString</span><span class="p">();</span>
</pre></div>
</div>
<p>This leaves us with a normal <a class="reference internal" href="../api/table.html#_CPPv4N5arrow5TableE" title="arrow::Table"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">Table</span></code></a>. Again, to do things with <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7DatasetE" title="arrow::dataset::Dataset"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">Datasets</span></code></a>
without moving to a <a class="reference internal" href="../api/table.html#_CPPv4N5arrow5TableE" title="arrow::Table"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">Table</span></code></a>, consider using Acero.</p>
</section>
</section>
<section id="writing-a-dataset-to-disk-from-table">
<h2>Writing a Dataset to Disk from Table<a class="headerlink" href="#writing-a-dataset-to-disk-from-table" title="Permalink to this heading">#</a></h2>
<p>Writing a <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7DatasetE" title="arrow::dataset::Dataset"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::Dataset</span></code></a> is a distinct task from writing a single file. The
task takes more work than writing a single file, due to needing to be
able to handle a partitioning scheme across multiple files and
folders. This process can be broken up into the following steps:</p>
<ol class="arabic simple">
<li><p>Prepare a <a class="reference internal" href="../api/table.html#_CPPv4N5arrow16TableBatchReaderE" title="arrow::TableBatchReader"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">TableBatchReader</span></code></a></p></li>
<li><p>Create a <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7ScannerE" title="arrow::dataset::Scanner"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::Scanner</span></code></a> to pull data from <a class="reference internal" href="../api/table.html#_CPPv4N5arrow16TableBatchReaderE" title="arrow::TableBatchReader"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">TableBatchReader</span></code></a></p></li>
<li><p>Prepare schema, partitioning, and file format options</p></li>
<li><p>Set up <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset29FileSystemDatasetWriteOptionsE" title="arrow::dataset::FileSystemDatasetWriteOptions"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::FileSystemDatasetWriteOptions</span></code></a> – a struct that configures our writing functions</p></li>
<li><p>Write dataset to disk</p></li>
</ol>
<section id="prepare-data-from-table-for-writing">
<h3>Prepare Data from Table for Writing<a class="headerlink" href="#prepare-data-from-table-for-writing" title="Permalink to this heading">#</a></h3>
<p>We have a <a class="reference internal" href="../api/table.html#_CPPv4N5arrow5TableE" title="arrow::Table"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">Table</span></code></a>, and we want to get a <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7DatasetE" title="arrow::dataset::Dataset"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::Dataset</span></code></a> on disk. In fact, for the
sake of exploration, we’ll use a different partitioning scheme for the
dataset – instead of just breaking into halves like the original
fragments, we’ll partition based on each row’s value in the “a” column.</p>
<p>To get started on that, let’s get a <a class="reference internal" href="../api/table.html#_CPPv4N5arrow16TableBatchReaderE" title="arrow::TableBatchReader"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">TableBatchReader</span></code></a>! This makes it very
easy to write to a <code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">Dataset</span></code>, and can be used elsewhere whenever a <a class="reference internal" href="../api/table.html#_CPPv4N5arrow5TableE" title="arrow::Table"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">Table</span></code></a>
needs to be broken into a stream of <a class="reference internal" href="../api/table.html#_CPPv4N5arrow11RecordBatchE" title="arrow::RecordBatch"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">RecordBatches</span></code></a>. Here, we can just use
the <a class="reference internal" href="../api/table.html#_CPPv4N5arrow16TableBatchReaderE" title="arrow::TableBatchReader"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">TableBatchReader’s</span></code></a> constructor, with our table:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="c1">// Now, let&#39;s get a table out to disk as a dataset!</span>
<span class="w"> </span><span class="c1">// We make a RecordBatchReader from our Table, then set up a scanner, which lets us</span>
<span class="w"> </span><span class="c1">// go to a file.</span>
<span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">shared_ptr</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">TableBatchReader</span><span class="o">&gt;</span><span class="w"> </span><span class="n">write_dataset</span><span class="w"> </span><span class="o">=</span>
<span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">make_shared</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">TableBatchReader</span><span class="o">&gt;</span><span class="p">(</span><span class="n">table</span><span class="p">);</span>
</pre></div>
</div>
</section>
<section id="create-scanner-for-moving-table-data">
<h3>Create Scanner for Moving Table Data<a class="headerlink" href="#create-scanner-for-moving-table-data" title="Permalink to this heading">#</a></h3>
<p>The process for writing a <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7DatasetE" title="arrow::dataset::Dataset"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::Dataset</span></code></a>, once a source of data is available,
is similar to the reverse of reading it. Before, we used a <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7ScannerE" title="arrow::dataset::Scanner"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::Scanner</span></code></a> in
order to scan into a <a class="reference internal" href="../api/table.html#_CPPv4N5arrow5TableE" title="arrow::Table"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">Table</span></code></a> – now, we need one to read out of our
<a class="reference internal" href="../api/table.html#_CPPv4N5arrow16TableBatchReaderE" title="arrow::TableBatchReader"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">TableBatchReader</span></code></a>. To get that <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7ScannerE" title="arrow::dataset::Scanner"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::Scanner</span></code></a>, we’ll make a <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset14ScannerBuilderE" title="arrow::dataset::ScannerBuilder"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::ScannerBuilder</span></code></a>
based on our <a class="reference internal" href="../api/table.html#_CPPv4N5arrow16TableBatchReaderE" title="arrow::TableBatchReader"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">TableBatchReader</span></code></a>, then use that Builder to build a <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7ScannerE" title="arrow::dataset::Scanner"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::Scanner</span></code></a>:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="k">auto</span><span class="w"> </span><span class="n">write_scanner_builder</span><span class="w"> </span><span class="o">=</span>
<span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">dataset</span><span class="o">::</span><span class="n">ScannerBuilder</span><span class="o">::</span><span class="n">FromRecordBatchReader</span><span class="p">(</span><span class="n">write_dataset</span><span class="p">);</span>
<span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">write_scanner</span><span class="p">,</span><span class="w"> </span><span class="n">write_scanner_builder</span><span class="o">-&gt;</span><span class="n">Finish</span><span class="p">())</span>
</pre></div>
</div>
</section>
<section id="prepare-schema-partitioning-and-file-format-variables">
<h3>Prepare Schema, Partitioning, and File Format Variables<a class="headerlink" href="#prepare-schema-partitioning-and-file-format-variables" title="Permalink to this heading">#</a></h3>
<p>Since we want to partition based on the “a” column, we need to declare
that. When defining our partitioning <a class="reference internal" href="../api/datatype.html#_CPPv4N5arrow6SchemaE" title="arrow::Schema"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">Schema</span></code></a>, we’ll just have a single
<a class="reference internal" href="../api/datatype.html#_CPPv4N5arrow5FieldE" title="arrow::Field"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">Field</span></code></a> that contains “a”:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="c1">// The partition schema determines which fields are used as keys for partitioning.</span>
<span class="w"> </span><span class="k">auto</span><span class="w"> </span><span class="n">partition_schema</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">schema</span><span class="p">({</span><span class="n">arrow</span><span class="o">::</span><span class="n">field</span><span class="p">(</span><span class="s">&quot;a&quot;</span><span class="p">,</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">utf8</span><span class="p">())});</span>
</pre></div>
</div>
<p>This <a class="reference internal" href="../api/datatype.html#_CPPv4N5arrow6SchemaE" title="arrow::Schema"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">Schema</span></code></a> determines what the key is for partitioning, but we need to
choose the algorithm that’ll do something with this key. We will use
Hive-style again, this time with our schema passed to it as
configuration:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="c1">// We&#39;ll use Hive-style partitioning, which creates directories with &quot;key=value&quot;</span>
<span class="w"> </span><span class="c1">// pairs.</span>
<span class="w"> </span><span class="k">auto</span><span class="w"> </span><span class="n">partitioning</span><span class="w"> </span><span class="o">=</span>
<span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">make_shared</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">dataset</span><span class="o">::</span><span class="n">HivePartitioning</span><span class="o">&gt;</span><span class="p">(</span><span class="n">partition_schema</span><span class="p">);</span>
</pre></div>
</div>
<p>Several file formats are available, but Parquet is commonly used with
Arrow, so we’ll write back out to that:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="c1">// Now, we declare we&#39;ll be writing Parquet files.</span>
<span class="w"> </span><span class="k">auto</span><span class="w"> </span><span class="n">write_format</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">make_shared</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">dataset</span><span class="o">::</span><span class="n">ParquetFileFormat</span><span class="o">&gt;</span><span class="p">();</span>
</pre></div>
</div>
</section>
<section id="configure-filesystemdatasetwriteoptions">
<h3>Configure FileSystemDatasetWriteOptions<a class="headerlink" href="#configure-filesystemdatasetwriteoptions" title="Permalink to this heading">#</a></h3>
<p>In order to write to disk, we need some configuration. We’ll do so via
setting values in a <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset29FileSystemDatasetWriteOptionsE" title="arrow::dataset::FileSystemDatasetWriteOptions"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::FileSystemDatasetWriteOptions</span></code></a> struct. We’ll
initialize it with defaults where possible:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="c1">// This time, we make Options for writing, but do much more configuration.</span>
<span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">dataset</span><span class="o">::</span><span class="n">FileSystemDatasetWriteOptions</span><span class="w"> </span><span class="n">write_options</span><span class="p">;</span>
<span class="w"> </span><span class="c1">// Defaults to start.</span>
<span class="w"> </span><span class="n">write_options</span><span class="p">.</span><span class="n">file_write_options</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">write_format</span><span class="o">-&gt;</span><span class="n">DefaultWriteOptions</span><span class="p">();</span>
</pre></div>
</div>
<p>One important step in writing to file is having a <a class="reference internal" href="../api/filesystem.html#_CPPv4N5arrow2fs10FileSystemE" title="arrow::fs::FileSystem"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">fs::FileSystem</span></code></a> to target.
Luckily, we have one from when we set it up for reading. This is a
simple variable assignment:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="c1">// Use the filesystem we already have.</span>
<span class="w"> </span><span class="n">write_options</span><span class="p">.</span><span class="n">filesystem</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">fs</span><span class="p">;</span>
</pre></div>
</div>
<p>Arrow can make the directory, but it does need a name for said
directory, so let’s give it one and call it “write_dataset”:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="c1">// Write to the folder &quot;write_dataset&quot; in current directory.</span>
<span class="w"> </span><span class="n">write_options</span><span class="p">.</span><span class="n">base_dir</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">&quot;write_dataset&quot;</span><span class="p">;</span>
</pre></div>
</div>
<p>We made a partitioning method previously, declaring that we’d use
Hive-style – this is where we actually pass that to our writing
function:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="c1">// Use the partitioning declared above.</span>
<span class="w"> </span><span class="n">write_options</span><span class="p">.</span><span class="n">partitioning</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">partitioning</span><span class="p">;</span>
</pre></div>
</div>
<p>Part of what’ll happen is Arrow will break up files, thus preventing
them from being too large to handle. This is what makes a dataset
fragmented in the first place. In order to set this up, we need a base
name for each fragment in a directory – in this case, we’ll have
“part{i}.parquet”, where “{i}” is replaced by a counter that starts at 0, so the third file (within the same
directory) will be called “part2.parquet”, for example:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="c1">// Define what the name for the files making up the dataset will be.</span>
<span class="w"> </span><span class="n">write_options</span><span class="p">.</span><span class="n">basename_template</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">&quot;part{i}.parquet&quot;</span><span class="p">;</span>
</pre></div>
</div>
<p>Sometimes, data will be written to the same location more than once, and
overwriting will be accepted. Since we may want to run this application
more than once, we will set Arrow to overwrite existing data – if we
didn’t, Arrow would abort due to seeing existing data after the first
run of this application:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="c1">// Set behavior to overwrite existing data -- specifically, this lets this example</span>
<span class="w"> </span><span class="c1">// be run more than once, and allows whatever code you have to overwrite what&#39;s there.</span>
<span class="w"> </span><span class="n">write_options</span><span class="p">.</span><span class="n">existing_data_behavior</span><span class="w"> </span><span class="o">=</span>
<span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">dataset</span><span class="o">::</span><span class="n">ExistingDataBehavior</span><span class="o">::</span><span class="n">kOverwriteOrIgnore</span><span class="p">;</span>
</pre></div>
</div>
</section>
<section id="write-dataset-to-disk">
<h3>Write Dataset to Disk<a class="headerlink" href="#write-dataset-to-disk" title="Permalink to this heading">#</a></h3>
<p>Once the <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset29FileSystemDatasetWriteOptionsE" title="arrow::dataset::FileSystemDatasetWriteOptions"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::FileSystemDatasetWriteOptions</span></code></a> has been configured, and a
<a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7ScannerE" title="arrow::dataset::Scanner"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::Scanner</span></code></a> is prepared to parse the data, we can pass the options and the
<a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset7ScannerE" title="arrow::dataset::Scanner"><code class="xref cpp cpp-class docutils literal notranslate"><span class="pre">dataset::Scanner</span></code></a> to <a class="reference internal" href="../api/dataset.html#_CPPv4N5arrow7dataset17FileSystemDataset5WriteERK29FileSystemDatasetWriteOptionsNSt10shared_ptrI7ScannerEE" title="arrow::dataset::FileSystemDataset::Write"><code class="xref cpp cpp-func docutils literal notranslate"><span class="pre">dataset::FileSystemDataset::Write()</span></code></a> to write the dataset out to
disk:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="c1">// Write to disk!</span>
<span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span>
<span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">dataset</span><span class="o">::</span><span class="n">FileSystemDataset</span><span class="o">::</span><span class="n">Write</span><span class="p">(</span><span class="n">write_options</span><span class="p">,</span><span class="w"> </span><span class="n">write_scanner</span><span class="p">));</span>
</pre></div>
</div>
<p>You can check your disk to see that you’ve written a folder containing
a subfolder for every value of “a”, each of which contains Parquet files!</p>
</section>
</section>
<section id="ending-program">
<h2>Ending Program<a class="headerlink" href="#ending-program" title="Permalink to this heading">#</a></h2>
<p>At the end, we just return <a class="reference internal" href="../api/support.html#_CPPv4N5arrow6Status2OKEv" title="arrow::Status::OK"><code class="xref cpp cpp-func docutils literal notranslate"><span class="pre">Status::OK()</span></code></a>, so that <code class="docutils literal notranslate"><span class="pre">main()</span></code> knows
we’re done and that everything’s okay, just like in the preceding
tutorials.</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">Status</span><span class="o">::</span><span class="n">OK</span><span class="p">();</span>
<span class="p">}</span>
</pre></div>
</div>
<p>With that, you’ve read and written partitioned datasets! This method,
with some configuration, will work for any supported dataset format. A
well-known example of such a dataset is the
<a class="reference external" href="https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page">NYC Taxi dataset</a>.
Now you can get larger-than-memory data mapped for use!</p>
<p>This means that we now need a way to process this data without
pulling it all into memory at once. For this, try Acero.</p>
<div class="admonition seealso">
<p class="admonition-title">See also</p>
<p><a class="reference internal" href="../streaming_execution.html"><span class="doc">Acero: A C++ streaming execution engine</span></a> for more information on Acero.</p>
</div>
<p>Refer to the listing below for a copy of the complete code:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="linenos"> 19</span><span class="c1">// (Doc section: Includes)</span>
<span class="linenos"> 20</span><span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;arrow/api.h&gt;</span>
<span class="linenos"> 21</span><span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;arrow/dataset/api.h&gt;</span>
<span class="linenos"> 22</span><span class="c1">// We use Parquet headers for setting up examples; they are not required for using</span>
<span class="linenos"> 23</span><span class="c1">// datasets.</span>
<span class="linenos"> 24</span><span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;parquet/arrow/reader.h&gt;</span>
<span class="linenos"> 25</span><span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;parquet/arrow/writer.h&gt;</span>
<span class="linenos"> 26</span>
<span class="linenos"> 27</span><span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;unistd.h&gt;</span>
<span class="linenos"> 28</span><span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;iostream&gt;</span>
<span class="linenos"> 29</span><span class="c1">// (Doc section: Includes)</span>
<span class="linenos"> 30</span>
<span class="linenos"> 31</span><span class="c1">// (Doc section: Helper Functions)</span>
<span class="linenos"> 32</span><span class="c1">// Generate some data for the rest of this example.</span>
<span class="linenos"> 33</span><span class="n">arrow</span><span class="o">::</span><span class="n">Result</span><span class="o">&lt;</span><span class="n">std</span><span class="o">::</span><span class="n">shared_ptr</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">Table</span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="n">CreateTable</span><span class="p">()</span><span class="w"> </span><span class="p">{</span>
<span class="linenos"> 34</span><span class="w"> </span><span class="c1">// This code should look familiar from the basic Arrow example, and is not the</span>
<span class="linenos"> 35</span><span class="w"> </span><span class="c1">// focus of this example. However, we need data to work on it, and this makes that!</span>
<span class="linenos"> 36</span><span class="w"> </span><span class="k">auto</span><span class="w"> </span><span class="n">schema</span><span class="w"> </span><span class="o">=</span>
<span class="linenos"> 37</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">schema</span><span class="p">({</span><span class="n">arrow</span><span class="o">::</span><span class="n">field</span><span class="p">(</span><span class="s">&quot;a&quot;</span><span class="p">,</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">int64</span><span class="p">()),</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">field</span><span class="p">(</span><span class="s">&quot;b&quot;</span><span class="p">,</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">int64</span><span class="p">()),</span>
<span class="linenos"> 38</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">field</span><span class="p">(</span><span class="s">&quot;c&quot;</span><span class="p">,</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">int64</span><span class="p">())});</span>
<span class="linenos"> 39</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">shared_ptr</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">Array</span><span class="o">&gt;</span><span class="w"> </span><span class="n">array_a</span><span class="p">;</span>
<span class="linenos"> 40</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">shared_ptr</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">Array</span><span class="o">&gt;</span><span class="w"> </span><span class="n">array_b</span><span class="p">;</span>
<span class="linenos"> 41</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">shared_ptr</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">Array</span><span class="o">&gt;</span><span class="w"> </span><span class="n">array_c</span><span class="p">;</span>
<span class="linenos"> 42</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">NumericBuilder</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">Int64Type</span><span class="o">&gt;</span><span class="w"> </span><span class="n">builder</span><span class="p">;</span>
<span class="linenos"> 43</span><span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span><span class="n">builder</span><span class="p">.</span><span class="n">AppendValues</span><span class="p">({</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span><span class="p">,</span><span class="w"> </span><span class="mi">5</span><span class="p">,</span><span class="w"> </span><span class="mi">6</span><span class="p">,</span><span class="w"> </span><span class="mi">7</span><span class="p">,</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span><span class="w"> </span><span class="mi">9</span><span class="p">}));</span>
<span class="linenos"> 44</span><span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span><span class="n">builder</span><span class="p">.</span><span class="n">Finish</span><span class="p">(</span><span class="o">&amp;</span><span class="n">array_a</span><span class="p">));</span>
<span class="linenos"> 45</span><span class="w"> </span><span class="n">builder</span><span class="p">.</span><span class="n">Reset</span><span class="p">();</span>
<span class="linenos"> 46</span><span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span><span class="n">builder</span><span class="p">.</span><span class="n">AppendValues</span><span class="p">({</span><span class="mi">9</span><span class="p">,</span><span class="w"> </span><span class="mi">8</span><span class="p">,</span><span class="w"> </span><span class="mi">7</span><span class="p">,</span><span class="w"> </span><span class="mi">6</span><span class="p">,</span><span class="w"> </span><span class="mi">5</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">}));</span>
<span class="linenos"> 47</span><span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span><span class="n">builder</span><span class="p">.</span><span class="n">Finish</span><span class="p">(</span><span class="o">&amp;</span><span class="n">array_b</span><span class="p">));</span>
<span class="linenos"> 48</span><span class="w"> </span><span class="n">builder</span><span class="p">.</span><span class="n">Reset</span><span class="p">();</span>
<span class="linenos"> 49</span><span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span><span class="n">builder</span><span class="p">.</span><span class="n">AppendValues</span><span class="p">({</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">}));</span>
<span class="linenos"> 50</span><span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span><span class="n">builder</span><span class="p">.</span><span class="n">Finish</span><span class="p">(</span><span class="o">&amp;</span><span class="n">array_c</span><span class="p">));</span>
<span class="linenos"> 51</span><span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">Table</span><span class="o">::</span><span class="n">Make</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span><span class="w"> </span><span class="p">{</span><span class="n">array_a</span><span class="p">,</span><span class="w"> </span><span class="n">array_b</span><span class="p">,</span><span class="w"> </span><span class="n">array_c</span><span class="p">});</span>
<span class="linenos"> 52</span><span class="p">}</span>
<span class="linenos"> 53</span>
<span class="linenos"> 54</span><span class="c1">// Set up a dataset by writing two Parquet files.</span>
<span class="linenos"> 55</span><span class="n">arrow</span><span class="o">::</span><span class="n">Result</span><span class="o">&lt;</span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&gt;</span><span class="w"> </span><span class="n">CreateExampleParquetDataset</span><span class="p">(</span>
<span class="linenos"> 56</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">shared_ptr</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">fs</span><span class="o">::</span><span class="n">FileSystem</span><span class="o">&gt;&amp;</span><span class="w"> </span><span class="n">filesystem</span><span class="p">,</span>
<span class="linenos"> 57</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">string</span><span class="o">&amp;</span><span class="w"> </span><span class="n">root_path</span><span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="linenos"> 58</span><span class="w"> </span><span class="c1">// Much like CreateTable(), this is utility that gets us the dataset we&#39;ll be reading</span>
<span class="linenos"> 59</span><span class="w"> </span><span class="c1">// from. Don&#39;t worry, we also write a dataset in the example proper.</span>
<span class="linenos"> 60</span><span class="w"> </span><span class="k">auto</span><span class="w"> </span><span class="n">base_path</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">root_path</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="s">&quot;parquet_dataset&quot;</span><span class="p">;</span>
<span class="linenos"> 61</span><span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span><span class="n">filesystem</span><span class="o">-&gt;</span><span class="n">CreateDir</span><span class="p">(</span><span class="n">base_path</span><span class="p">));</span>
<span class="linenos"> 62</span><span class="w"> </span><span class="c1">// Create an Arrow Table</span>
<span class="linenos"> 63</span><span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">table</span><span class="p">,</span><span class="w"> </span><span class="n">CreateTable</span><span class="p">());</span>
<span class="linenos"> 64</span><span class="w"> </span><span class="c1">// Write it into two Parquet files</span>
<span class="linenos"> 65</span><span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">output</span><span class="p">,</span>
<span class="linenos"> 66</span><span class="w"> </span><span class="n">filesystem</span><span class="o">-&gt;</span><span class="n">OpenOutputStream</span><span class="p">(</span><span class="n">base_path</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="s">&quot;/data1.parquet&quot;</span><span class="p">));</span>
<span class="linenos"> 67</span><span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span><span class="n">parquet</span><span class="o">::</span><span class="n">arrow</span><span class="o">::</span><span class="n">WriteTable</span><span class="p">(</span>
<span class="linenos"> 68</span><span class="w"> </span><span class="o">*</span><span class="n">table</span><span class="o">-&gt;</span><span class="n">Slice</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="mi">5</span><span class="p">),</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">default_memory_pool</span><span class="p">(),</span><span class="w"> </span><span class="n">output</span><span class="p">,</span><span class="w"> </span><span class="mi">2048</span><span class="p">));</span>
<span class="linenos"> 69</span><span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="n">output</span><span class="p">,</span>
<span class="linenos"> 70</span><span class="w"> </span><span class="n">filesystem</span><span class="o">-&gt;</span><span class="n">OpenOutputStream</span><span class="p">(</span><span class="n">base_path</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="s">&quot;/data2.parquet&quot;</span><span class="p">));</span>
<span class="linenos"> 71</span><span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span><span class="n">parquet</span><span class="o">::</span><span class="n">arrow</span><span class="o">::</span><span class="n">WriteTable</span><span class="p">(</span>
<span class="linenos"> 72</span><span class="w"> </span><span class="o">*</span><span class="n">table</span><span class="o">-&gt;</span><span class="n">Slice</span><span class="p">(</span><span class="mi">5</span><span class="p">),</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">default_memory_pool</span><span class="p">(),</span><span class="w"> </span><span class="n">output</span><span class="p">,</span><span class="w"> </span><span class="mi">2048</span><span class="p">));</span>
<span class="linenos"> 73</span><span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="n">base_path</span><span class="p">;</span>
<span class="linenos"> 74</span><span class="p">}</span>
<span class="linenos"> 75</span>
<span class="linenos"> 76</span><span class="n">arrow</span><span class="o">::</span><span class="n">Status</span><span class="w"> </span><span class="n">PrepareEnv</span><span class="p">()</span><span class="w"> </span><span class="p">{</span>
<span class="linenos"> 77</span><span class="w"> </span><span class="c1">// Get our environment prepared for reading, by setting up some quick writing.</span>
<span class="linenos"> 78</span><span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">src_table</span><span class="p">,</span><span class="w"> </span><span class="n">CreateTable</span><span class="p">())</span>
<span class="linenos"> 79</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">shared_ptr</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">fs</span><span class="o">::</span><span class="n">FileSystem</span><span class="o">&gt;</span><span class="w"> </span><span class="n">setup_fs</span><span class="p">;</span>
<span class="linenos"> 80</span><span class="w"> </span><span class="c1">// Note this operates in the directory the executable is built in.</span>
<span class="linenos"> 81</span><span class="w"> </span><span class="kt">char</span><span class="w"> </span><span class="n">setup_path</span><span class="p">[</span><span class="mi">256</span><span class="p">];</span>
<span class="linenos"> 82</span><span class="w"> </span><span class="kt">char</span><span class="o">*</span><span class="w"> </span><span class="n">result</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">getcwd</span><span class="p">(</span><span class="n">setup_path</span><span class="p">,</span><span class="w"> </span><span class="mi">256</span><span class="p">);</span>
<span class="linenos"> 83</span><span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">result</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="nb">NULL</span><span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="linenos"> 84</span><span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">Status</span><span class="o">::</span><span class="n">IOError</span><span class="p">(</span><span class="s">&quot;Fetching PWD failed.&quot;</span><span class="p">);</span>
<span class="linenos"> 85</span><span class="w"> </span><span class="p">}</span>
<span class="linenos"> 86</span>
<span class="linenos"> 87</span><span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="n">setup_fs</span><span class="p">,</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">fs</span><span class="o">::</span><span class="n">FileSystemFromUriOrPath</span><span class="p">(</span><span class="n">setup_path</span><span class="p">));</span>
<span class="linenos"> 88</span><span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">dset_path</span><span class="p">,</span><span class="w"> </span><span class="n">CreateExampleParquetDataset</span><span class="p">(</span><span class="n">setup_fs</span><span class="p">,</span><span class="w"> </span><span class="s">&quot;&quot;</span><span class="p">));</span>
<span class="linenos"> 89</span>
<span class="linenos"> 90</span><span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">Status</span><span class="o">::</span><span class="n">OK</span><span class="p">();</span>
<span class="linenos"> 91</span><span class="p">}</span>
<span class="linenos"> 92</span><span class="c1">// (Doc section: Helper Functions)</span>
<span class="linenos"> 93</span>
<span class="linenos"> 94</span><span class="c1">// (Doc section: RunMain)</span>
<span class="linenos"> 95</span><span class="n">arrow</span><span class="o">::</span><span class="n">Status</span><span class="w"> </span><span class="n">RunMain</span><span class="p">()</span><span class="w"> </span><span class="p">{</span>
<span class="linenos"> 96</span><span class="w"> </span><span class="c1">// (Doc section: RunMain)</span>
<span class="linenos"> 97</span><span class="w"> </span><span class="c1">// (Doc section: PrepareEnv)</span>
<span class="linenos"> 98</span><span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span><span class="n">PrepareEnv</span><span class="p">());</span>
<span class="linenos"> 99</span><span class="w"> </span><span class="c1">// (Doc section: PrepareEnv)</span>
<span class="linenos">100</span>
<span class="linenos">101</span><span class="w"> </span><span class="c1">// (Doc section: FileSystem Declare)</span>
<span class="linenos">102</span><span class="w"> </span><span class="c1">// First, we need a filesystem object, which lets us interact with our local</span>
<span class="linenos">103</span><span class="w"> </span><span class="c1">// filesystem starting at a given path. For the sake of simplicity, that&#39;ll be</span>
<span class="linenos">104</span><span class="w"> </span><span class="c1">// the current directory.</span>
<span class="linenos">105</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">shared_ptr</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">fs</span><span class="o">::</span><span class="n">FileSystem</span><span class="o">&gt;</span><span class="w"> </span><span class="n">fs</span><span class="p">;</span>
<span class="linenos">106</span><span class="w"> </span><span class="c1">// (Doc section: FileSystem Declare)</span>
<span class="linenos">107</span>
<span class="linenos">108</span><span class="w"> </span><span class="c1">// (Doc section: FileSystem Init)</span>
<span class="linenos">109</span><span class="w"> </span><span class="c1">// Get the CWD, use it to make the FileSystem object.</span>
<span class="linenos">110</span><span class="w"> </span><span class="kt">char</span><span class="w"> </span><span class="n">init_path</span><span class="p">[</span><span class="mi">256</span><span class="p">];</span>
<span class="linenos">111</span><span class="w"> </span><span class="kt">char</span><span class="o">*</span><span class="w"> </span><span class="n">result</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">getcwd</span><span class="p">(</span><span class="n">init_path</span><span class="p">,</span><span class="w"> </span><span class="mi">256</span><span class="p">);</span>
<span class="linenos">112</span><span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">result</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="nb">NULL</span><span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="linenos">113</span><span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">Status</span><span class="o">::</span><span class="n">IOError</span><span class="p">(</span><span class="s">&quot;Fetching PWD failed.&quot;</span><span class="p">);</span>
<span class="linenos">114</span><span class="w"> </span><span class="p">}</span>
<span class="linenos">115</span><span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="n">fs</span><span class="p">,</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">fs</span><span class="o">::</span><span class="n">FileSystemFromUriOrPath</span><span class="p">(</span><span class="n">init_path</span><span class="p">));</span>
<span class="linenos">116</span><span class="w"> </span><span class="c1">// (Doc section: FileSystem Init)</span>
<span class="linenos">117</span>
<span class="linenos">118</span><span class="w"> </span><span class="c1">// (Doc section: FileSelector Declare)</span>
<span class="linenos">119</span><span class="w"> </span><span class="c1">// A file selector lets us actually traverse a multi-file dataset.</span>
<span class="linenos">120</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">fs</span><span class="o">::</span><span class="n">FileSelector</span><span class="w"> </span><span class="n">selector</span><span class="p">;</span>
<span class="linenos">121</span><span class="w"> </span><span class="c1">// (Doc section: FileSelector Declare)</span>
<span class="linenos">122</span><span class="w"> </span><span class="c1">// (Doc section: FileSelector Config)</span>
<span class="linenos">123</span><span class="w"> </span><span class="n">selector</span><span class="p">.</span><span class="n">base_dir</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">&quot;parquet_dataset&quot;</span><span class="p">;</span>
<span class="linenos">124</span><span class="w"> </span><span class="c1">// Recursive is a safe bet if you don&#39;t know the nesting of your dataset.</span>
<span class="linenos">125</span><span class="w"> </span><span class="n">selector</span><span class="p">.</span><span class="n">recursive</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="nb">true</span><span class="p">;</span>
<span class="linenos">126</span><span class="w"> </span><span class="c1">// (Doc section: FileSelector Config)</span>
<span class="linenos">127</span><span class="w"> </span><span class="c1">// (Doc section: FileSystemFactoryOptions)</span>
<span class="linenos">128</span><span class="w"> </span><span class="c1">// Making an options object lets us configure our dataset reading.</span>
<span class="linenos">129</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">dataset</span><span class="o">::</span><span class="n">FileSystemFactoryOptions</span><span class="w"> </span><span class="n">options</span><span class="p">;</span>
<span class="linenos">130</span><span class="w"> </span><span class="c1">// We&#39;ll use Hive-style partitioning. We&#39;ll let Arrow Datasets infer the partition</span>
<span class="linenos">131</span><span class="w"> </span><span class="c1">// schema. We won&#39;t set any other options, defaults are fine.</span>
<span class="linenos">132</span><span class="w"> </span><span class="n">options</span><span class="p">.</span><span class="n">partitioning</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">dataset</span><span class="o">::</span><span class="n">HivePartitioning</span><span class="o">::</span><span class="n">MakeFactory</span><span class="p">();</span>
<span class="linenos">133</span><span class="w"> </span><span class="c1">// (Doc section: FileSystemFactoryOptions)</span>
<span class="linenos">134</span><span class="w"> </span><span class="c1">// (Doc section: File Format Setup)</span>
<span class="linenos">135</span><span class="w"> </span><span class="k">auto</span><span class="w"> </span><span class="n">read_format</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">make_shared</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">dataset</span><span class="o">::</span><span class="n">ParquetFileFormat</span><span class="o">&gt;</span><span class="p">();</span>
<span class="linenos">136</span><span class="w"> </span><span class="c1">// (Doc section: File Format Setup)</span>
<span class="linenos">137</span><span class="w"> </span><span class="c1">// (Doc section: FileSystemDatasetFactory Make)</span>
<span class="linenos">138</span><span class="w"> </span><span class="c1">// Now, we get a factory that will let us get our dataset -- we don&#39;t have the</span>
<span class="linenos">139</span><span class="w"> </span><span class="c1">// dataset yet!</span>
<span class="linenos">140</span><span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">factory</span><span class="p">,</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">dataset</span><span class="o">::</span><span class="n">FileSystemDatasetFactory</span><span class="o">::</span><span class="n">Make</span><span class="p">(</span>
<span class="linenos">141</span><span class="w"> </span><span class="n">fs</span><span class="p">,</span><span class="w"> </span><span class="n">selector</span><span class="p">,</span><span class="w"> </span><span class="n">read_format</span><span class="p">,</span><span class="w"> </span><span class="n">options</span><span class="p">));</span>
<span class="linenos">142</span><span class="w"> </span><span class="c1">// (Doc section: FileSystemDatasetFactory Make)</span>
<span class="linenos">143</span><span class="w"> </span><span class="c1">// (Doc section: FileSystemDatasetFactory Finish)</span>
<span class="linenos">144</span><span class="w"> </span><span class="c1">// Now we build our dataset from the factory.</span>
<span class="linenos">145</span><span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">read_dataset</span><span class="p">,</span><span class="w"> </span><span class="n">factory</span><span class="o">-&gt;</span><span class="n">Finish</span><span class="p">());</span>
<span class="linenos">146</span><span class="w"> </span><span class="c1">// (Doc section: FileSystemDatasetFactory Finish)</span>
<span class="linenos">147</span><span class="w"> </span><span class="c1">// (Doc section: Dataset Fragments)</span>
<span class="linenos">148</span><span class="w"> </span><span class="c1">// Print out the fragments</span>
<span class="linenos">149</span><span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">fragments</span><span class="p">,</span><span class="w"> </span><span class="n">read_dataset</span><span class="o">-&gt;</span><span class="n">GetFragments</span><span class="p">());</span>
<span class="linenos">150</span><span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="k">const</span><span class="w"> </span><span class="k">auto</span><span class="o">&amp;</span><span class="w"> </span><span class="n">fragment</span><span class="w"> </span><span class="o">:</span><span class="w"> </span><span class="n">fragments</span><span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="linenos">151</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">cout</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;Found fragment: &quot;</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="p">(</span><span class="o">*</span><span class="n">fragment</span><span class="p">)</span><span class="o">-&gt;</span><span class="n">ToString</span><span class="p">()</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">endl</span><span class="p">;</span>
<span class="linenos">152</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">cout</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="s">&quot;Partition expression: &quot;</span>
<span class="linenos">153</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="p">(</span><span class="o">*</span><span class="n">fragment</span><span class="p">)</span><span class="o">-&gt;</span><span class="n">partition_expression</span><span class="p">().</span><span class="n">ToString</span><span class="p">()</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">endl</span><span class="p">;</span>
<span class="linenos">154</span><span class="w"> </span><span class="p">}</span>
<span class="linenos">155</span><span class="w"> </span><span class="c1">// (Doc section: Dataset Fragments)</span>
<span class="linenos">156</span><span class="w"> </span><span class="c1">// (Doc section: Read Scan Builder)</span>
<span class="linenos">157</span><span class="w"> </span><span class="c1">// Scan dataset into a Table -- once this is done, you can do</span>
<span class="linenos">158</span><span class="w"> </span><span class="c1">// normal table things with it, like computation and printing. However, now you&#39;re</span>
<span class="linenos">159</span><span class="w"> </span><span class="c1">// also dedicated to being in memory.</span>
<span class="linenos">160</span><span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">read_scan_builder</span><span class="p">,</span><span class="w"> </span><span class="n">read_dataset</span><span class="o">-&gt;</span><span class="n">NewScan</span><span class="p">());</span>
<span class="linenos">161</span><span class="w"> </span><span class="c1">// (Doc section: Read Scan Builder)</span>
<span class="linenos">162</span><span class="w"> </span><span class="c1">// (Doc section: Read Scanner)</span>
<span class="linenos">163</span><span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">read_scanner</span><span class="p">,</span><span class="w"> </span><span class="n">read_scan_builder</span><span class="o">-&gt;</span><span class="n">Finish</span><span class="p">());</span>
<span class="linenos">164</span><span class="w"> </span><span class="c1">// (Doc section: Read Scanner)</span>
<span class="linenos">165</span><span class="w"> </span><span class="c1">// (Doc section: To Table)</span>
<span class="linenos">166</span><span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="n">std</span><span class="o">::</span><span class="n">shared_ptr</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">Table</span><span class="o">&gt;</span><span class="w"> </span><span class="n">table</span><span class="p">,</span><span class="w"> </span><span class="n">read_scanner</span><span class="o">-&gt;</span><span class="n">ToTable</span><span class="p">());</span>
<span class="linenos">167</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">cout</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">table</span><span class="o">-&gt;</span><span class="n">ToString</span><span class="p">();</span>
<span class="linenos">168</span><span class="w"> </span><span class="c1">// (Doc section: To Table)</span>
<span class="linenos">169</span>
<span class="linenos">170</span><span class="w"> </span><span class="c1">// (Doc section: TableBatchReader)</span>
<span class="linenos">171</span><span class="w"> </span><span class="c1">// Now, let&#39;s get a table out to disk as a dataset!</span>
<span class="linenos">172</span><span class="w"> </span><span class="c1">// We make a RecordBatchReader from our Table, then set up a scanner, which lets us</span>
<span class="linenos">173</span><span class="w"> </span><span class="c1">// go to a file.</span>
<span class="linenos">174</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">shared_ptr</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">TableBatchReader</span><span class="o">&gt;</span><span class="w"> </span><span class="n">write_dataset</span><span class="w"> </span><span class="o">=</span>
<span class="linenos">175</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">make_shared</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">TableBatchReader</span><span class="o">&gt;</span><span class="p">(</span><span class="n">table</span><span class="p">);</span>
<span class="linenos">176</span><span class="w"> </span><span class="c1">// (Doc section: TableBatchReader)</span>
<span class="linenos">177</span><span class="w"> </span><span class="c1">// (Doc section: WriteScanner)</span>
<span class="linenos">178</span><span class="w"> </span><span class="k">auto</span><span class="w"> </span><span class="n">write_scanner_builder</span><span class="w"> </span><span class="o">=</span>
<span class="linenos">179</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">dataset</span><span class="o">::</span><span class="n">ScannerBuilder</span><span class="o">::</span><span class="n">FromRecordBatchReader</span><span class="p">(</span><span class="n">write_dataset</span><span class="p">);</span>
<span class="linenos">180</span><span class="w"> </span><span class="n">ARROW_ASSIGN_OR_RAISE</span><span class="p">(</span><span class="k">auto</span><span class="w"> </span><span class="n">write_scanner</span><span class="p">,</span><span class="w"> </span><span class="n">write_scanner_builder</span><span class="o">-&gt;</span><span class="n">Finish</span><span class="p">())</span>
<span class="linenos">181</span><span class="w"> </span><span class="c1">// (Doc section: WriteScanner)</span>
<span class="linenos">182</span><span class="w"> </span><span class="c1">// (Doc section: Partition Schema)</span>
<span class="linenos">183</span><span class="w"> </span><span class="c1">// The partition schema determines which fields are used as keys for partitioning.</span>
<span class="linenos">184</span><span class="w"> </span><span class="k">auto</span><span class="w"> </span><span class="n">partition_schema</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">schema</span><span class="p">({</span><span class="n">arrow</span><span class="o">::</span><span class="n">field</span><span class="p">(</span><span class="s">&quot;a&quot;</span><span class="p">,</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">utf8</span><span class="p">())});</span>
<span class="linenos">185</span><span class="w"> </span><span class="c1">// (Doc section: Partition Schema)</span>
<span class="linenos">186</span><span class="w"> </span><span class="c1">// (Doc section: Partition Create)</span>
<span class="linenos">187</span><span class="w"> </span><span class="c1">// We&#39;ll use Hive-style partitioning, which creates directories with &quot;key=value&quot;</span>
<span class="linenos">188</span><span class="w"> </span><span class="c1">// pairs.</span>
<span class="linenos">189</span><span class="w"> </span><span class="k">auto</span><span class="w"> </span><span class="n">partitioning</span><span class="w"> </span><span class="o">=</span>
<span class="linenos">190</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">make_shared</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">dataset</span><span class="o">::</span><span class="n">HivePartitioning</span><span class="o">&gt;</span><span class="p">(</span><span class="n">partition_schema</span><span class="p">);</span>
<span class="linenos">191</span><span class="w"> </span><span class="c1">// (Doc section: Partition Create)</span>
<span class="linenos">192</span><span class="w"> </span><span class="c1">// (Doc section: Write Format)</span>
<span class="linenos">193</span><span class="w"> </span><span class="c1">// Now, we declare we&#39;ll be writing Parquet files.</span>
<span class="linenos">194</span><span class="w"> </span><span class="k">auto</span><span class="w"> </span><span class="n">write_format</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">make_shared</span><span class="o">&lt;</span><span class="n">arrow</span><span class="o">::</span><span class="n">dataset</span><span class="o">::</span><span class="n">ParquetFileFormat</span><span class="o">&gt;</span><span class="p">();</span>
<span class="linenos">195</span><span class="w"> </span><span class="c1">// (Doc section: Write Format)</span>
<span class="linenos">196</span><span class="w"> </span><span class="c1">// (Doc section: Write Options)</span>
<span class="linenos">197</span><span class="w"> </span><span class="c1">// This time, we make Options for writing, but do much more configuration.</span>
<span class="linenos">198</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">dataset</span><span class="o">::</span><span class="n">FileSystemDatasetWriteOptions</span><span class="w"> </span><span class="n">write_options</span><span class="p">;</span>
<span class="linenos">199</span><span class="w"> </span><span class="c1">// Defaults to start.</span>
<span class="linenos">200</span><span class="w"> </span><span class="n">write_options</span><span class="p">.</span><span class="n">file_write_options</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">write_format</span><span class="o">-&gt;</span><span class="n">DefaultWriteOptions</span><span class="p">();</span>
<span class="linenos">201</span><span class="w"> </span><span class="c1">// (Doc section: Write Options)</span>
<span class="linenos">202</span><span class="w"> </span><span class="c1">// (Doc section: Options FS)</span>
<span class="linenos">203</span><span class="w"> </span><span class="c1">// Use the filesystem we already have.</span>
<span class="linenos">204</span><span class="w"> </span><span class="n">write_options</span><span class="p">.</span><span class="n">filesystem</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">fs</span><span class="p">;</span>
<span class="linenos">205</span><span class="w"> </span><span class="c1">// (Doc section: Options FS)</span>
<span class="linenos">206</span><span class="w"> </span><span class="c1">// (Doc section: Options Target)</span>
<span class="linenos">207</span><span class="w"> </span><span class="c1">// Write to the folder &quot;write_dataset&quot; in current directory.</span>
<span class="linenos">208</span><span class="w"> </span><span class="n">write_options</span><span class="p">.</span><span class="n">base_dir</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">&quot;write_dataset&quot;</span><span class="p">;</span>
<span class="linenos">209</span><span class="w"> </span><span class="c1">// (Doc section: Options Target)</span>
<span class="linenos">210</span><span class="w"> </span><span class="c1">// (Doc section: Options Partitioning)</span>
<span class="linenos">211</span><span class="w"> </span><span class="c1">// Use the partitioning declared above.</span>
<span class="linenos">212</span><span class="w"> </span><span class="n">write_options</span><span class="p">.</span><span class="n">partitioning</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">partitioning</span><span class="p">;</span>
<span class="linenos">213</span><span class="w"> </span><span class="c1">// (Doc section: Options Partitioning)</span>
<span class="linenos">214</span><span class="w"> </span><span class="c1">// (Doc section: Options Name Template)</span>
<span class="linenos">215</span><span class="w"> </span><span class="c1">// Define what the name for the files making up the dataset will be.</span>
<span class="linenos">216</span><span class="w"> </span><span class="n">write_options</span><span class="p">.</span><span class="n">basename_template</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">&quot;part{i}.parquet&quot;</span><span class="p">;</span>
<span class="linenos">217</span><span class="w"> </span><span class="c1">// (Doc section: Options Name Template)</span>
<span class="linenos">218</span><span class="w"> </span><span class="c1">// (Doc section: Options File Behavior)</span>
<span class="linenos">219</span><span class="w"> </span><span class="c1">// Set behavior to overwrite existing data -- specifically, this lets this example</span>
<span class="linenos">220</span><span class="w"> </span><span class="c1">// be run more than once, and allows whatever code you have to overwrite what&#39;s there.</span>
<span class="linenos">221</span><span class="w"> </span><span class="n">write_options</span><span class="p">.</span><span class="n">existing_data_behavior</span><span class="w"> </span><span class="o">=</span>
<span class="linenos">222</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">dataset</span><span class="o">::</span><span class="n">ExistingDataBehavior</span><span class="o">::</span><span class="n">kOverwriteOrIgnore</span><span class="p">;</span>
<span class="linenos">223</span><span class="w"> </span><span class="c1">// (Doc section: Options File Behavior)</span>
<span class="linenos">224</span><span class="w"> </span><span class="c1">// (Doc section: Write Dataset)</span>
<span class="linenos">225</span><span class="w"> </span><span class="c1">// Write to disk!</span>
<span class="linenos">226</span><span class="w"> </span><span class="n">ARROW_RETURN_NOT_OK</span><span class="p">(</span>
<span class="linenos">227</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">dataset</span><span class="o">::</span><span class="n">FileSystemDataset</span><span class="o">::</span><span class="n">Write</span><span class="p">(</span><span class="n">write_options</span><span class="p">,</span><span class="w"> </span><span class="n">write_scanner</span><span class="p">));</span>
<span class="linenos">228</span><span class="w"> </span><span class="c1">// (Doc section: Write Dataset)</span>
<span class="linenos">229</span><span class="w"> </span><span class="c1">// (Doc section: Ret)</span>
<span class="linenos">230</span><span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">Status</span><span class="o">::</span><span class="n">OK</span><span class="p">();</span>
<span class="linenos">231</span><span class="p">}</span>
<span class="linenos">232</span><span class="c1">// (Doc section: Ret)</span>
<span class="linenos">233</span><span class="c1">// (Doc section: Main)</span>
<span class="linenos">234</span><span class="kt">int</span><span class="w"> </span><span class="n">main</span><span class="p">()</span><span class="w"> </span><span class="p">{</span>
<span class="linenos">235</span><span class="w"> </span><span class="n">arrow</span><span class="o">::</span><span class="n">Status</span><span class="w"> </span><span class="n">st</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">RunMain</span><span class="p">();</span>
<span class="linenos">236</span><span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="o">!</span><span class="n">st</span><span class="p">.</span><span class="n">ok</span><span class="p">())</span><span class="w"> </span><span class="p">{</span>
<span class="linenos">237</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">cerr</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">st</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">endl</span><span class="p">;</span>
<span class="linenos">238</span><span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="mi">1</span><span class="p">;</span>
<span class="linenos">239</span><span class="w"> </span><span class="p">}</span>
<span class="linenos">240</span><span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span>
<span class="linenos">241</span><span class="p">}</span>
<span class="linenos">242</span><span class="c1">// (Doc section: Main)</span>
</pre></div>
</div>
</section>
</section>
</article>
<!-- Previous/next page links rendered directly after the article body. -->
<footer class="prev-next-footer">
  <div class="prev-next-area">
    <a class="left-prev" href="compute_tutorial.html" title="previous page">
      <!-- Decorative arrow glyph: the link text below already names the target,
           so the icon is hidden from assistive technology. -->
      <i class="fa-solid fa-angle-left" aria-hidden="true"></i>
      <div class="prev-next-info">
        <p class="prev-next-subtitle">previous</p>
        <p class="prev-next-title">Arrow Compute</p>
      </div>
    </a>
    <a class="right-next" href="../user_guide.html" title="next page">
      <div class="prev-next-info">
        <p class="prev-next-subtitle">next</p>
        <p class="prev-next-title">User Guide</p>
      </div>
      <!-- Decorative arrow glyph, hidden from assistive technology. -->
      <i class="fa-solid fa-angle-right" aria-hidden="true"></i>
    </a>
  </div>
</footer>
</div>
<!-- Secondary sidebar: the in-page table of contents followed by the "Edit on GitHub" link. -->
<div class="bd-sidebar-secondary bd-toc">
  <div class="sidebar-secondary-items sidebar-secondary__inner">
    <div class="sidebar-secondary-item">
      <!-- This element is the aria-labelledby target of the nav below, so the
           decorative list icon is hidden to keep the accessible name to the text only. -->
      <div class="page-toc tocsection onthispage" id="pst-page-navigation-heading-2">
        <i class="fa-solid fa-list" aria-hidden="true"></i> On this page
      </div>
      <nav class="bd-toc-nav page-toc" aria-labelledby="pst-page-navigation-heading-2">
        <ul class="visible nav section-nav flex-column">
          <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#pre-requisites">Pre-requisites</a></li>
          <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#setup">Setup</a>
            <ul class="visible nav section-nav flex-column">
              <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#includes">Includes</a></li>
              <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#main">Main()</a></li>
              <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#generating-files-for-reading">Generating Files for Reading</a></li>
            </ul>
          </li>
          <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#reading-a-partitioned-dataset">Reading a Partitioned Dataset</a>
            <ul class="visible nav section-nav flex-column">
              <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#preparing-a-filesystem-object">Preparing a FileSystem Object</a></li>
              <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#creating-a-filesystemdatasetfactory">Creating a FileSystemDatasetFactory</a></li>
              <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#build-dataset-using-factory">Build Dataset using Factory</a></li>
              <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#move-dataset-into-table">Move Dataset into Table</a></li>
            </ul>
          </li>
          <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#writing-a-dataset-to-disk-from-table">Writing a Dataset to Disk from Table</a>
            <ul class="visible nav section-nav flex-column">
              <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prepare-data-from-table-for-writing">Prepare Data from Table for Writing</a></li>
              <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#create-scanner-for-moving-table-data">Create Scanner for Moving Table Data</a></li>
              <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prepare-schema-partitioning-and-file-format-variables">Prepare Schema, Partitioning, and File Format Variables</a></li>
              <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#configure-filesystemdatasetwriteoptions">Configure FileSystemDatasetWriteOptions</a></li>
              <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#write-dataset-to-disk">Write Dataset to Disk</a></li>
            </ul>
          </li>
          <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ending-program">Ending Program</a></li>
        </ul>
      </nav>
    </div>
    <div class="sidebar-secondary-item">
      <div class="tocsection editthispage">
        <a href="https://github.com/apache/arrow/edit/main/docs/source/cpp/tutorials/datasets_tutorial.rst">
          <!-- Decorative pencil glyph; the visible link text carries the meaning. -->
          <i class="fa-solid fa-pencil" aria-hidden="true"></i>
          Edit on GitHub
        </a>
      </div>
    </div>
  </div>
</div>
</div>
<!-- Content-area footer; it has no child elements on this page. -->
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Theme scripts, placed after the main content so they do not block parsing of the markup above; both files are preloaded via <link rel="preload"> in <head>. -->
<script src="../../_static/scripts/bootstrap.js?digest=8d27b9dea8ad943066ae"></script>
<script src="../../_static/scripts/pydata-sphinx-theme.js?digest=8d27b9dea8ad943066ae"></script>
<!-- Page footer: copyright notice and Sphinx version in the start group, theme version in the end group. -->
<footer class="bd-footer">
  <div class="bd-footer__inner bd-page-width">
    <div class="footer-items__start">
      <div class="footer-item">
        <p class="copyright">
          © Copyright 2016-2024 Apache Software Foundation.
          Apache Arrow, Arrow, Apache, the Apache feather logo, and the Apache Arrow project logo are either registered trademarks or trademarks of The Apache Software Foundation in the United States and other countries.
          <br>
        </p>
      </div>
      <div class="footer-item">
        <p class="sphinx-version">
          Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 6.2.0.
          <br>
        </p>
      </div>
    </div>
    <div class="footer-items__end">
      <div class="footer-item">
        <p class="theme-version">
          Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.15.2.
        </p>
      </div>
    </div>
  </div>
</footer>
</body>
</html>