blob: 1f9ad0ec896c6c757e743da2d767caeeb48296b9 [file] [log] [blame]
<!DOCTYPE html>
<html lang="en" data-content_root="" >
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.19: https://docutils.sourceforge.io/" />
<title>Dataset &#8212; Apache Arrow v17.0.0.dev52</title>
<script data-cfasync="false">
// Apply the persisted colour mode and theme to the root element as early as
// possible (this script runs ahead of the stylesheet links that follow it).
// Reading localStorage can throw (e.g. a SecurityError when the browser is
// configured to block site data), so fall back to the defaults in that case
// instead of aborting the script with neither attribute set.
(function () {
  var root = document.documentElement;
  var mode = "";
  var theme = "light";
  try {
    mode = localStorage.getItem("mode") || "";
    theme = localStorage.getItem("theme") || "light";
  } catch (err) {
    // Storage is unavailable: keep the defaults assigned above.
  }
  root.dataset.mode = mode;
  root.dataset.theme = theme;
})();
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../_static/styles/theme.css?digest=8d27b9dea8ad943066ae" rel="stylesheet" />
<link href="../_static/styles/bootstrap.css?digest=8d27b9dea8ad943066ae" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=8d27b9dea8ad943066ae" rel="stylesheet" />
<link href="../_static/vendor/fontawesome/6.5.1/css/all.min.css?digest=8d27b9dea8ad943066ae" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.1/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.1/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.1/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../_static/pygments.css" />
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css" />
<link rel="stylesheet" type="text/css" href="../_static/design-style.1e8bd061cd6da7fc9cf755528e8ffc24.min.css" />
<link rel="stylesheet" type="text/css" href="../_static/theme_overrides.css" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=8d27b9dea8ad943066ae" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=8d27b9dea8ad943066ae" />
<script src="../_static/vendor/fontawesome/6.5.1/js/all.min.js?digest=8d27b9dea8ad943066ae"></script>
<script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script>
<script src="../_static/doctools.js"></script>
<script src="../_static/sphinx_highlight.js"></script>
<script src="../_static/clipboard.min.js"></script>
<script src="../_static/copybutton.js"></script>
<script src="../_static/design-tabs.js"></script>
<script>DOCUMENTATION_OPTIONS.pagename = 'java/dataset';</script>
<script>
// Theme-related settings attached to the DOCUMENTATION_OPTIONS global.
// NOTE(review): the object is presumably created by documentation_options.js, which is loaded earlier in the head; confirm.
DOCUMENTATION_OPTIONS.theme_version = '0.15.2';
// Site-absolute URL of the version list; presumably read by the version switcher dropdowns (confirm).
DOCUMENTATION_OPTIONS.theme_switcher_json_url = '/docs/_static/versions.json';
// NOTE(review): looks like the key that identifies this build among the entries of versions.json; confirm.
DOCUMENTATION_OPTIONS.theme_switcher_version_match = 'dev/';
DOCUMENTATION_OPTIONS.show_version_warning_banner = true;
</script>
<link rel="canonical" href="https://arrow.apache.org/docs/java/dataset.html" />
<link rel="icon" href="../_static/favicon.ico"/>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="Substrait" href="substrait.html" />
<link rel="prev" title="Arrow Flight SQL JDBC Driver" href="flight_sql_jdbc_driver.html" />
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<!-- Matomo -->
<script>
// Matomo command queue: reuse window._paq when it already exists, otherwise start an empty one.
var _paq = window._paq = window._paq || [];
// Tracker configuration calls (for example "setCustomDimension") have to be queued ahead of "trackPageView".
// Cookie-based tracking is switched off on purpose to avoid privacy issues.
_paq.push(['disableCookies']);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
// Base URL of the analytics host; both the tracking endpoint and the loader script live under it.
var trackerBase="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', trackerBase+'matomo.php']);
_paq.push(['setSiteId', '20']);
// Inject the Matomo loader asynchronously, ahead of the first script element in the document.
var loader=document.createElement('script');
var firstScript=document.getElementsByTagName('script')[0];
loader.async=true;
loader.src=trackerBase+'matomo.js';
firstScript.parentNode.insertBefore(loader,firstScript);
})();
</script>
<!-- End Matomo Code -->
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<a id="pst-skip-link" class="skip-link" href="#main-content">Skip to main content</a>
<div id="pst-scroll-pixel-helper"></div>
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
<i class="fa-solid fa-arrow-up"></i>
Back to top
</button>
<input type="checkbox"
class="sidebar-toggle"
name="__primary"
id="__primary"/>
<label class="overlay overlay-primary" for="__primary"></label>
<input type="checkbox"
class="sidebar-toggle"
name="__secondary"
id="__secondary"/>
<label class="overlay overlay-secondary" for="__secondary"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search the docs ..."
aria-label="Search the docs ..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<header class="bd-header navbar navbar-expand-lg bd-navbar">
<div class="bd-header__inner bd-page-width">
<label class="sidebar-toggle primary-toggle" for="__primary">
<span class="fa-solid fa-bars"></span>
</label>
<div class="col-lg-3 navbar-header-items__start">
<div class="navbar-item">
<a class="navbar-brand logo" href="../index.html">
<img src="../_static/arrow.png" class="logo__image only-light" alt="Apache Arrow v17.0.0.dev52 - Home"/>
<script>document.write(`<img src="../_static/arrow-dark.png" class="logo__image only-dark" alt="Apache Arrow v17.0.0.dev52 - Home"/>`);</script>
</a></div>
</div>
<div class="col-lg-9 navbar-header-items">
<div class="me-auto navbar-header-items__center">
<div class="navbar-item">
<nav class="navbar-nav">
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../format/index.html">
Specifications
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../developers/index.html">
Development
</a>
</li>
<li class="nav-item dropdown">
<button class="btn dropdown-toggle nav-item" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-controls="pst-nav-more-links">
Implementations
</button>
<ul id="pst-nav-more-links" class="dropdown-menu">
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../c_glib/index.html">
C/GLib
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../cpp/index.html">
C++
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://github.com/apache/arrow/blob/main/csharp/README.md">
C#
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://pkg.go.dev/github.com/apache/arrow/go/v17">
Go
</a>
</li>
<li class="nav-item current active">
<a class="nav-link dropdown-item nav-internal" href="index.html">
Java
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../js/index.html">
JavaScript
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/julia/">
Julia
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://github.com/apache/arrow/blob/main/matlab/README.md">
MATLAB
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/nanoarrow/">
nanoarrow
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../python/index.html">
Python
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../r/index.html">
R
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://github.com/apache/arrow/blob/main/ruby/README.md">
Ruby
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://docs.rs/crate/arrow/">
Rust
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../status.html">
Implementation Status
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/cpp/">
C++ cookbook
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/java/">
Java cookbook
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/py/">
Python cookbook
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/r/">
R cookbook
</a>
</li>
</ul>
</li>
</ul>
</nav></div>
</div>
<div class="navbar-header-items__end">
<div class="navbar-item navbar-persistent--container">
<script>
document.write(`
<button class="btn navbar-btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
`);
</script>
</div>
<div class="navbar-item">
<script>
document.write(`
<div class="version-switcher__container dropdown">
<button id="pst-version-switcher-button-2"
type="button"
class="version-switcher__button btn btn-sm navbar-btn dropdown-toggle"
data-bs-toggle="dropdown"
aria-haspopup="listbox"
aria-controls="pst-version-switcher-list-2"
aria-label="Version switcher list"
>
Choose version <!-- this text may get changed later by javascript -->
<span class="caret"></span>
</button>
<div id="pst-version-switcher-list-2"
class="version-switcher__menu dropdown-menu list-group-flush py-0"
role="listbox" aria-labelledby="pst-version-switcher-button-2">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
`);
</script></div>
<div class="navbar-item">
<script>
document.write(`
<button class="btn btn-sm navbar-btn theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch nav-link" data-mode="light"><i class="fa-solid fa-sun fa-lg"></i></span>
<span class="theme-switch nav-link" data-mode="dark"><i class="fa-solid fa-moon fa-lg"></i></span>
<span class="theme-switch nav-link" data-mode="auto"><i class="fa-solid fa-circle-half-stroke fa-lg"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/arrow" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-square-github fa-lg" aria-hidden="true"></i></span>
<span class="sr-only">GitHub</span></a>
</li>
<li class="nav-item">
<a href="https://twitter.com/ApacheArrow" title="X" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-square-x-twitter fa-lg" aria-hidden="true"></i></span>
<span class="sr-only">X</span></a>
</li>
</ul></div>
</div>
</div>
<div class="navbar-persistent--mobile">
<script>
document.write(`
<button class="btn navbar-btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
`);
</script>
</div>
<label class="sidebar-toggle secondary-toggle" for="__secondary" tabindex="0">
<span class="fa-solid fa-outdent"></span>
</label>
</div>
</header>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar">
<div class="sidebar-header-items sidebar-primary__section">
<div class="sidebar-header-items__center">
<div class="navbar-item">
<nav class="navbar-nav">
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../format/index.html">
Specifications
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../developers/index.html">
Development
</a>
</li>
<li class="nav-item dropdown">
<button class="btn dropdown-toggle nav-item" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-controls="pst-nav-more-links-2">
Implementations
</button>
<ul id="pst-nav-more-links-2" class="dropdown-menu">
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../c_glib/index.html">
C/GLib
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../cpp/index.html">
C++
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://github.com/apache/arrow/blob/main/csharp/README.md">
C#
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://pkg.go.dev/github.com/apache/arrow/go/v17">
Go
</a>
</li>
<li class="nav-item current active">
<a class="nav-link dropdown-item nav-internal" href="index.html">
Java
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../js/index.html">
JavaScript
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/julia/">
Julia
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://github.com/apache/arrow/blob/main/matlab/README.md">
MATLAB
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/nanoarrow/">
nanoarrow
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../python/index.html">
Python
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../r/index.html">
R
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://github.com/apache/arrow/blob/main/ruby/README.md">
Ruby
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://docs.rs/crate/arrow/">
Rust
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-internal" href="../status.html">
Implementation Status
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/cpp/">
C++ cookbook
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/java/">
Java cookbook
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/py/">
Python cookbook
</a>
</li>
<li class="nav-item">
<a class="nav-link dropdown-item nav-external" href="https://arrow.apache.org/cookbook/r/">
R cookbook
</a>
</li>
</ul>
</li>
</ul>
</nav></div>
</div>
<div class="sidebar-header-items__end">
<div class="navbar-item">
<script>
document.write(`
<div class="version-switcher__container dropdown">
<button id="pst-version-switcher-button-3"
type="button"
class="version-switcher__button btn btn-sm navbar-btn dropdown-toggle"
data-bs-toggle="dropdown"
aria-haspopup="listbox"
aria-controls="pst-version-switcher-list-3"
aria-label="Version switcher list"
>
Choose version <!-- this text may get changed later by javascript -->
<span class="caret"></span>
</button>
<div id="pst-version-switcher-list-3"
class="version-switcher__menu dropdown-menu list-group-flush py-0"
role="listbox" aria-labelledby="pst-version-switcher-button-3">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
`);
</script></div>
<div class="navbar-item">
<script>
document.write(`
<button class="btn btn-sm navbar-btn theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch nav-link" data-mode="light"><i class="fa-solid fa-sun fa-lg"></i></span>
<span class="theme-switch nav-link" data-mode="dark"><i class="fa-solid fa-moon fa-lg"></i></span>
<span class="theme-switch nav-link" data-mode="auto"><i class="fa-solid fa-circle-half-stroke fa-lg"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/arrow" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-square-github fa-lg" aria-hidden="true"></i></span>
<span class="sr-only">GitHub</span></a>
</li>
<li class="nav-item">
<a href="https://twitter.com/ApacheArrow" title="X" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-square-x-twitter fa-lg" aria-hidden="true"></i></span>
<span class="sr-only">X</span></a>
</li>
</ul></div>
</div>
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item">
<nav class="bd-docs-nav bd-links"
aria-label="Section Navigation">
<p class="bd-links__title" role="heading" aria-level="1">Section Navigation</p>
<div class="bd-toc-item navbar-nav"><ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="quickstartguide.html">Quick Start Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="overview.html">High-Level Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="install.html">Installing Java Modules</a></li>
<li class="toctree-l1"><a class="reference internal" href="memory.html">Memory Management</a></li>
<li class="toctree-l1"><a class="reference internal" href="vector.html">ValueVector</a></li>
<li class="toctree-l1"><a class="reference internal" href="vector_schema_root.html">Tabular Data</a></li>
<li class="toctree-l1"><a class="reference internal" href="table.html">Table</a></li>
<li class="toctree-l1"><a class="reference internal" href="ipc.html">Reading/Writing IPC formats</a></li>
<li class="toctree-l1"><a class="reference internal" href="algorithm.html">Java Algorithms</a></li>
<li class="toctree-l1"><a class="reference internal" href="flight.html">Arrow Flight RPC</a></li>
<li class="toctree-l1"><a class="reference internal" href="flight_sql.html">Arrow Flight SQL</a></li>
<li class="toctree-l1"><a class="reference internal" href="flight_sql_jdbc_driver.html">Arrow Flight SQL JDBC Driver</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">Dataset</a></li>
<li class="toctree-l1"><a class="reference internal" href="substrait.html">Substrait</a></li>
<li class="toctree-l1"><a class="reference internal" href="cdata.html">C Data Interface</a></li>
<li class="toctree-l1"><a class="reference internal" href="jdbc.html">Arrow JDBC Adapter</a></li>
<li class="toctree-l1"><a class="reference internal" href="reference/index.html">Reference (javadoc)</a></li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/cookbook/java/">Java cookbook</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main">
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item">
<nav aria-label="Breadcrumb">
<ul class="bd-breadcrumbs">
<li class="breadcrumb-item breadcrumb-home">
<a href="../index.html" class="nav-link" aria-label="Home">
<i class="fa-solid fa-home"></i>
</a>
</li>
<li class="breadcrumb-item"><a href="index.html" class="nav-link">Java Implementation</a></li>
<li class="breadcrumb-item active" aria-current="page">Dataset</li>
</ul>
</nav>
</div>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article">
<section id="dataset">
<h1><a class="toc-backref" href="#id1" role="doc-backlink">Dataset</a><a class="headerlink" href="#dataset" title="Permalink to this heading">#</a></h1>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>Experimental: The Java module <code class="docutils literal notranslate"><span class="pre">dataset</span></code> is currently under early
development. The API may change in each release of Apache Arrow until it
matures.</p>
</div>
<p>Dataset is a universal layer in Apache Arrow for querying data in different
formats or with different partitioning strategies. Usually the data to be queried
is located in a traditional file system; however, Arrow Dataset
is not designed only for querying files but can be extended to serve all
possible data sources, such as inter-process communication or other
network locations.</p>
<nav class="contents" id="contents">
<p class="topic-title">Contents</p>
<ul class="simple">
<li><p><a class="reference internal" href="#dataset" id="id1">Dataset</a></p>
<ul>
<li><p><a class="reference internal" href="#getting-started" id="id2">Getting Started</a></p></li>
<li><p><a class="reference internal" href="#schema" id="id3">Schema</a></p></li>
<li><p><a class="reference internal" href="#projection-subset-of-columns" id="id4">Projection (Subset of Columns)</a></p></li>
<li><p><a class="reference internal" href="#projection-produce-new-columns-and-filters" id="id5">Projection (Produce New Columns) and Filters</a></p></li>
<li><p><a class="reference internal" href="#read-data-from-hdfs" id="id6">Read Data from HDFS</a></p></li>
<li><p><a class="reference internal" href="#native-memory-management" id="id7">Native Memory Management</a></p></li>
<li><p><a class="reference internal" href="#usage-notes" id="id8">Usage Notes</a></p>
<ul>
<li><p><a class="reference internal" href="#native-object-resource-management" id="id9">Native Object Resource Management</a></p></li>
<li><p><a class="reference internal" href="#batchsize" id="id10">BatchSize</a></p></li>
</ul>
</li>
</ul>
</li>
</ul>
</nav>
<section id="getting-started">
<h2><a class="toc-backref" href="#id2" role="doc-backlink">Getting Started</a><a class="headerlink" href="#getting-started" title="Permalink to this heading">#</a></h2>
<p>Currently supported file formats are:</p>
<ul class="simple">
<li><p>Apache Arrow (<code class="docutils literal notranslate"><span class="pre">.arrow</span></code>)</p></li>
<li><p>Apache ORC (<code class="docutils literal notranslate"><span class="pre">.orc</span></code>)</p></li>
<li><p>Apache Parquet (<code class="docutils literal notranslate"><span class="pre">.parquet</span></code>)</p></li>
<li><p>Comma-Separated Values (<code class="docutils literal notranslate"><span class="pre">.csv</span></code>)</p></li>
<li><p>Line-delimited JSON Values (<code class="docutils literal notranslate"><span class="pre">.json</span></code>)</p></li>
</ul>
<p>Below is a simple example of using Dataset to query a Parquet file in Java:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="c1">// read data from file /opt/example.parquet</span>
<span class="n">String</span><span class="w"> </span><span class="n">uri</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">&quot;file:/opt/example.parquet&quot;</span><span class="p">;</span>
<span class="n">ScanOptions</span><span class="w"> </span><span class="n">options</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">ScanOptions</span><span class="p">(</span><span class="cm">/*batchSize*/</span><span class="w"> </span><span class="mi">32768</span><span class="p">);</span>
<span class="k">try</span><span class="w"> </span><span class="p">(</span>
<span class="w"> </span><span class="n">BufferAllocator</span><span class="w"> </span><span class="n">allocator</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">RootAllocator</span><span class="p">();</span>
<span class="w"> </span><span class="n">DatasetFactory</span><span class="w"> </span><span class="n">datasetFactory</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">FileSystemDatasetFactory</span><span class="p">(</span>
<span class="w"> </span><span class="n">allocator</span><span class="p">,</span><span class="w"> </span><span class="n">NativeMemoryPool</span><span class="p">.</span><span class="na">getDefault</span><span class="p">(),</span>
<span class="w"> </span><span class="n">FileFormat</span><span class="p">.</span><span class="na">PARQUET</span><span class="p">,</span><span class="w"> </span><span class="n">uri</span><span class="p">);</span>
<span class="w"> </span><span class="n">Dataset</span><span class="w"> </span><span class="n">dataset</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">datasetFactory</span><span class="p">.</span><span class="na">finish</span><span class="p">();</span>
<span class="w"> </span><span class="n">Scanner</span><span class="w"> </span><span class="n">scanner</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">dataset</span><span class="p">.</span><span class="na">newScan</span><span class="p">(</span><span class="n">options</span><span class="p">);</span>
<span class="w"> </span><span class="n">ArrowReader</span><span class="w"> </span><span class="n">reader</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">scanner</span><span class="p">.</span><span class="na">scanBatches</span><span class="p">()</span>
<span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">List</span><span class="o">&lt;</span><span class="n">ArrowRecordBatch</span><span class="o">&gt;</span><span class="w"> </span><span class="n">batches</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">ArrayList</span><span class="o">&lt;&gt;</span><span class="p">();</span>
<span class="w"> </span><span class="k">while</span><span class="w"> </span><span class="p">(</span><span class="n">reader</span><span class="p">.</span><span class="na">loadNextBatch</span><span class="p">())</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="k">try</span><span class="w"> </span><span class="p">(</span><span class="n">VectorSchemaRoot</span><span class="w"> </span><span class="n">root</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">reader</span><span class="p">.</span><span class="na">getVectorSchemaRoot</span><span class="p">())</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="kd">final</span><span class="w"> </span><span class="n">VectorUnloader</span><span class="w"> </span><span class="n">unloader</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">VectorUnloader</span><span class="p">(</span><span class="n">root</span><span class="p">);</span>
<span class="w"> </span><span class="n">batches</span><span class="p">.</span><span class="na">add</span><span class="p">(</span><span class="n">unloader</span><span class="p">.</span><span class="na">getRecordBatch</span><span class="p">());</span>
<span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="c1">// do something with read record batches, for example:</span>
<span class="w"> </span><span class="n">analyzeArrowData</span><span class="p">(</span><span class="n">batches</span><span class="p">);</span>
<span class="w"> </span><span class="c1">// finished the analysis of the data, close all resources:</span>
<span class="w"> </span><span class="n">AutoCloseables</span><span class="p">.</span><span class="na">close</span><span class="p">(</span><span class="n">batches</span><span class="p">);</span>
<span class="p">}</span><span class="w"> </span><span class="k">catch</span><span class="w"> </span><span class="p">(</span><span class="n">Exception</span><span class="w"> </span><span class="n">e</span><span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">e</span><span class="p">.</span><span class="na">printStackTrace</span><span class="p">();</span>
<span class="p">}</span>
</pre></div>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p><code class="docutils literal notranslate"><span class="pre">ArrowRecordBatch</span></code> is a low-level composite Arrow data exchange format
that doesn’t provide an API to read typed data from it directly.
It’s recommended to use the utility <code class="docutils literal notranslate"><span class="pre">VectorLoader</span></code> to load it into a schema-aware
container <code class="docutils literal notranslate"><span class="pre">VectorSchemaRoot</span></code>, through which users can access
decoded data conveniently in Java.</p>
<p>The <code class="docutils literal notranslate"><span class="pre">ScanOptions</span> <span class="pre">batchSize</span></code> argument takes effect only if it is set to a value
smaller than the number of rows in the record batch.</p>
</div>
<div class="admonition seealso">
<p class="admonition-title">See also</p>
<p>Load record batches with <a class="reference internal" href="vector_schema_root.html"><span class="doc">VectorSchemaRoot</span></a>.</p>
</div>
</section>
<section id="schema">
<h2><a class="toc-backref" href="#id3" role="doc-backlink">Schema</a><a class="headerlink" href="#schema" title="Permalink to this heading">#</a></h2>
<p>The schema of the data to be queried can be inspected via the method
<code class="docutils literal notranslate"><span class="pre">DatasetFactory#inspect()</span></code> before actually reading it. For example:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="c1">// read data from local file /opt/example.parquet</span>
<span class="n">String</span><span class="w"> </span><span class="n">uri</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">&quot;file:/opt/example.parquet&quot;</span><span class="p">;</span>
<span class="n">BufferAllocator</span><span class="w"> </span><span class="n">allocator</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">RootAllocator</span><span class="p">(</span><span class="n">Long</span><span class="p">.</span><span class="na">MAX_VALUE</span><span class="p">);</span>
<span class="n">DatasetFactory</span><span class="w"> </span><span class="n">factory</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">FileSystemDatasetFactory</span><span class="p">(</span><span class="n">allocator</span><span class="p">,</span>
<span class="w"> </span><span class="n">NativeMemoryPool</span><span class="p">.</span><span class="na">getDefault</span><span class="p">(),</span><span class="w"> </span><span class="n">FileFormat</span><span class="p">.</span><span class="na">PARQUET</span><span class="p">,</span><span class="w"> </span><span class="n">uri</span><span class="p">);</span>
<span class="c1">// inspect schema</span>
<span class="n">Schema</span><span class="w"> </span><span class="n">schema</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">factory</span><span class="p">.</span><span class="na">inspect</span><span class="p">();</span>
</pre></div>
</div>
<p>For data formats that are compatible with a user-defined schema, the user
can call method <code class="docutils literal notranslate"><span class="pre">DatasetFactory#finish(Schema</span> <span class="pre">schema)</span></code> to create the dataset:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="n">Schema</span><span class="w"> </span><span class="n">schema</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">createUserSchema</span><span class="p">();</span>
<span class="n">Dataset</span><span class="w"> </span><span class="n">dataset</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">factory</span><span class="p">.</span><span class="na">finish</span><span class="p">(</span><span class="n">schema</span><span class="p">);</span>
</pre></div>
</div>
<p>Otherwise, when the non-parameter method <code class="docutils literal notranslate"><span class="pre">DatasetFactory#finish()</span></code> is called,
the schema will be inferred automatically from the data source, the same as the
result of <code class="docutils literal notranslate"><span class="pre">DatasetFactory#inspect()</span></code>.</p>
<p>Also, if a projection is specified during scanning (see next section
<a class="reference internal" href="#java-dataset-projection"><span class="std std-ref">Projection (Subset of Columns)</span></a>), the actual schema of the output data can be
obtained via method <code class="docutils literal notranslate"><span class="pre">Scanner#schema()</span></code>:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="n">Scanner</span><span class="w"> </span><span class="n">scanner</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">dataset</span><span class="p">.</span><span class="na">newScan</span><span class="p">(</span>
<span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">ScanOptions</span><span class="p">(</span><span class="mi">32768</span><span class="p">,</span><span class="w"> </span><span class="n">Optional</span><span class="p">.</span><span class="na">of</span><span class="p">(</span><span class="k">new</span><span class="w"> </span><span class="n">String</span><span class="o">[]</span><span class="w"> </span><span class="p">{</span><span class="s">&quot;id&quot;</span><span class="p">,</span><span class="w"> </span><span class="s">&quot;name&quot;</span><span class="p">})));</span>
<span class="n">Schema</span><span class="w"> </span><span class="n">projectedSchema</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">scanner</span><span class="p">.</span><span class="na">schema</span><span class="p">();</span>
</pre></div>
</div>
</section>
<section id="projection-subset-of-columns">
<span id="java-dataset-projection"></span><h2><a class="toc-backref" href="#id4" role="doc-backlink">Projection (Subset of Columns)</a><a class="headerlink" href="#projection-subset-of-columns" title="Permalink to this heading">#</a></h2>
<p>User can specify projections in ScanOptions. For example:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="n">String</span><span class="o">[]</span><span class="w"> </span><span class="n">projection</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">String</span><span class="o">[]</span><span class="w"> </span><span class="p">{</span><span class="s">&quot;id&quot;</span><span class="p">,</span><span class="w"> </span><span class="s">&quot;name&quot;</span><span class="p">};</span>
<span class="n">ScanOptions</span><span class="w"> </span><span class="n">options</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">ScanOptions</span><span class="p">(</span><span class="mi">32768</span><span class="p">,</span><span class="w"> </span><span class="n">Optional</span><span class="p">.</span><span class="na">of</span><span class="p">(</span><span class="n">projection</span><span class="p">));</span>
</pre></div>
</div>
<p>If no projection is needed, leave the optional projection argument absent in
ScanOptions:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="n">ScanOptions</span><span class="w"> </span><span class="n">options</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">ScanOptions</span><span class="p">(</span><span class="mi">32768</span><span class="p">,</span><span class="w"> </span><span class="n">Optional</span><span class="p">.</span><span class="na">empty</span><span class="p">());</span>
</pre></div>
</div>
<p>Or use shortcut constructor:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="n">ScanOptions</span><span class="w"> </span><span class="n">options</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">ScanOptions</span><span class="p">(</span><span class="mi">32768</span><span class="p">);</span>
</pre></div>
</div>
<p>Then all columns will be emitted during scanning.</p>
</section>
<section id="projection-produce-new-columns-and-filters">
<h2><a class="toc-backref" href="#id5" role="doc-backlink">Projection (Produce New Columns) and Filters</a><a class="headerlink" href="#projection-produce-new-columns-and-filters" title="Permalink to this heading">#</a></h2>
<p>User can specify projections (new columns) or filters in ScanOptions using Substrait. For example:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="n">ByteBuffer</span><span class="w"> </span><span class="n">substraitExpressionFilter</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">getSubstraitExpressionFilter</span><span class="p">();</span>
<span class="n">ByteBuffer</span><span class="w"> </span><span class="n">substraitExpressionProject</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">getSubstraitExpressionProjection</span><span class="p">();</span>
<span class="c1">// Use Substrait APIs to create an Expression and serialize to a ByteBuffer</span>
<span class="n">ScanOptions</span><span class="w"> </span><span class="n">options</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">ScanOptions</span><span class="p">.</span><span class="na">Builder</span><span class="p">(</span><span class="cm">/*batchSize*/</span><span class="w"> </span><span class="mi">32768</span><span class="p">)</span>
<span class="w"> </span><span class="p">.</span><span class="na">columns</span><span class="p">(</span><span class="n">Optional</span><span class="p">.</span><span class="na">empty</span><span class="p">())</span>
<span class="w"> </span><span class="p">.</span><span class="na">substraitExpressionFilter</span><span class="p">(</span><span class="n">substraitExpressionFilter</span><span class="p">)</span>
<span class="w"> </span><span class="p">.</span><span class="na">substraitExpressionProjection</span><span class="p">(</span><span class="n">substraitExpressionProject</span><span class="p">)</span>
<span class="w"> </span><span class="p">.</span><span class="na">build</span><span class="p">();</span>
</pre></div>
</div>
<div class="admonition seealso">
<p class="admonition-title">See also</p>
<dl class="simple">
<dt><a class="reference internal" href="substrait.html"><span class="doc">Executing Projections and Filters Using Extended Expressions</span></a></dt><dd><p>Projections and Filters using Substrait.</p>
</dd>
</dl>
</div>
</section>
<section id="read-data-from-hdfs">
<h2><a class="toc-backref" href="#id6" role="doc-backlink">Read Data from HDFS</a><a class="headerlink" href="#read-data-from-hdfs" title="Permalink to this heading">#</a></h2>
<p><code class="docutils literal notranslate"><span class="pre">FileSystemDataset</span></code> supports reading data from non-local file systems. HDFS
support is included in the official Apache Arrow Java package releases and
can be used directly without re-building the source code.</p>
<p>To access HDFS data using Dataset API, pass a general HDFS URI to
<code class="docutils literal notranslate"><span class="pre">FileSystemDatasetFactory</span></code>:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="n">String</span><span class="w"> </span><span class="n">uri</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">&quot;hdfs://{hdfs_host}:{port}/data/example.parquet&quot;</span><span class="p">;</span>
<span class="n">BufferAllocator</span><span class="w"> </span><span class="n">allocator</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">RootAllocator</span><span class="p">(</span><span class="n">Long</span><span class="p">.</span><span class="na">MAX_VALUE</span><span class="p">);</span>
<span class="n">DatasetFactory</span><span class="w"> </span><span class="n">factory</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">FileSystemDatasetFactory</span><span class="p">(</span><span class="n">allocator</span><span class="p">,</span>
<span class="w"> </span><span class="n">NativeMemoryPool</span><span class="p">.</span><span class="na">getDefault</span><span class="p">(),</span><span class="w"> </span><span class="n">FileFormat</span><span class="p">.</span><span class="na">PARQUET</span><span class="p">,</span><span class="w"> </span><span class="n">uri</span><span class="p">);</span>
</pre></div>
</div>
</section>
<section id="native-memory-management">
<h2><a class="toc-backref" href="#id7" role="doc-backlink">Native Memory Management</a><a class="headerlink" href="#native-memory-management" title="Permalink to this heading">#</a></h2>
<p>To gain better performance and reduce code complexity, Java
<code class="docutils literal notranslate"><span class="pre">FileSystemDataset</span></code> internally relies on C++
<code class="docutils literal notranslate"><span class="pre">arrow::dataset::FileSystemDataset</span></code> via JNI.
As a result, all Arrow data read from <code class="docutils literal notranslate"><span class="pre">FileSystemDataset</span></code> is supposed to be
allocated off the JVM heap. To manage this part of memory, a utility class
<code class="docutils literal notranslate"><span class="pre">NativeMemoryPool</span></code> is provided to users.</p>
<p>As a basic example, by using a listenable <code class="docutils literal notranslate"><span class="pre">NativeMemoryPool</span></code>, user can pass
a listener hooking on C++ buffer allocation/deallocation:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="n">AtomicLong</span><span class="w"> </span><span class="n">reserved</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">AtomicLong</span><span class="p">(</span><span class="mi">0</span><span class="n">L</span><span class="p">);</span>
<span class="n">ReservationListener</span><span class="w"> </span><span class="n">listener</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">ReservationListener</span><span class="p">()</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nd">@Override</span>
<span class="w"> </span><span class="kd">public</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="nf">reserve</span><span class="p">(</span><span class="kt">long</span><span class="w"> </span><span class="n">size</span><span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">reserved</span><span class="p">.</span><span class="na">getAndAdd</span><span class="p">(</span><span class="n">size</span><span class="p">);</span>
<span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="nd">@Override</span>
<span class="w"> </span><span class="kd">public</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="nf">unreserve</span><span class="p">(</span><span class="kt">long</span><span class="w"> </span><span class="n">size</span><span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">reserved</span><span class="p">.</span><span class="na">getAndAdd</span><span class="p">(</span><span class="o">-</span><span class="n">size</span><span class="p">);</span>
<span class="w"> </span><span class="p">}</span>
<span class="p">};</span>
<span class="n">NativeMemoryPool</span><span class="w"> </span><span class="n">pool</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">NativeMemoryPool</span><span class="p">.</span><span class="na">createListenable</span><span class="p">(</span><span class="n">listener</span><span class="p">);</span>
<span class="n">FileSystemDatasetFactory</span><span class="w"> </span><span class="n">factory</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">FileSystemDatasetFactory</span><span class="p">(</span><span class="n">allocator</span><span class="p">,</span>
<span class="w"> </span><span class="n">pool</span><span class="p">,</span><span class="w"> </span><span class="n">FileFormat</span><span class="p">.</span><span class="na">PARQUET</span><span class="p">,</span><span class="w"> </span><span class="n">uri</span><span class="p">);</span>
</pre></div>
</div>
<p>Also, it’s a very common case to reserve the same amount of JVM direct memory
for the data read from datasets. For this use case, a built-in utility
class <code class="docutils literal notranslate"><span class="pre">DirectReservationListener</span></code> is provided:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="n">NativeMemoryPool</span><span class="w"> </span><span class="n">pool</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">NativeMemoryPool</span><span class="p">.</span><span class="na">createListenable</span><span class="p">(</span>
<span class="w"> </span><span class="n">DirectReservationListener</span><span class="p">.</span><span class="na">instance</span><span class="p">());</span>
</pre></div>
</div>
<p>This way, once the allocated byte count of Arrow buffers reaches the limit of
JVM direct memory, <code class="docutils literal notranslate"><span class="pre">OutOfMemoryError:</span> <span class="pre">Direct</span> <span class="pre">buffer</span> <span class="pre">memory</span></code> will
be thrown during scanning.</p>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>The default instance <code class="docutils literal notranslate"><span class="pre">NativeMemoryPool.getDefault()</span></code> does
nothing on buffer allocation/deallocation. It’s OK to use it in
the case of POC or testing, but for production use in complex environments,
it’s recommended to manage memory by using a listenable memory pool.</p>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>The <code class="docutils literal notranslate"><span class="pre">BufferAllocator</span></code> instance passed to <code class="docutils literal notranslate"><span class="pre">FileSystemDatasetFactory</span></code>’s
constructor is also aware of the overall memory usage of the produced
dataset instances. Once the Java buffers are created the passed allocator
will become their parent allocator.</p>
</div>
</section>
<section id="usage-notes">
<h2><a class="toc-backref" href="#id8" role="doc-backlink">Usage Notes</a><a class="headerlink" href="#usage-notes" title="Permalink to this heading">#</a></h2>
<section id="native-object-resource-management">
<h3><a class="toc-backref" href="#id9" role="doc-backlink">Native Object Resource Management</a><a class="headerlink" href="#native-object-resource-management" title="Permalink to this heading">#</a></h3>
<p>As another result of relying on JNI, all components related to
<code class="docutils literal notranslate"><span class="pre">FileSystemDataset</span></code> should be closed manually or via try-with-resources to
release the corresponding native objects after use. For example:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="n">String</span><span class="w"> </span><span class="n">uri</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">&quot;file:/opt/example.parquet&quot;</span><span class="p">;</span>
<span class="n">ScanOptions</span><span class="w"> </span><span class="n">options</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">ScanOptions</span><span class="p">(</span><span class="cm">/*batchSize*/</span><span class="w"> </span><span class="mi">32768</span><span class="p">);</span>
<span class="k">try</span><span class="w"> </span><span class="p">(</span>
<span class="w"> </span><span class="n">BufferAllocator</span><span class="w"> </span><span class="n">allocator</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">RootAllocator</span><span class="p">();</span>
<span class="w"> </span><span class="n">DatasetFactory</span><span class="w"> </span><span class="n">factory</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">FileSystemDatasetFactory</span><span class="p">(</span>
<span class="w"> </span><span class="n">allocator</span><span class="p">,</span><span class="w"> </span><span class="n">NativeMemoryPool</span><span class="p">.</span><span class="na">getDefault</span><span class="p">(),</span>
<span class="w"> </span><span class="n">FileFormat</span><span class="p">.</span><span class="na">PARQUET</span><span class="p">,</span><span class="w"> </span><span class="n">uri</span><span class="p">);</span>
<span class="w"> </span><span class="n">Dataset</span><span class="w"> </span><span class="n">dataset</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">factory</span><span class="p">.</span><span class="na">finish</span><span class="p">();</span>
<span class="w"> </span><span class="n">Scanner</span><span class="w"> </span><span class="n">scanner</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">dataset</span><span class="p">.</span><span class="na">newScan</span><span class="p">(</span><span class="n">options</span><span class="p">)</span>
<span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="c1">// do something</span>
<span class="p">}</span><span class="w"> </span><span class="k">catch</span><span class="w"> </span><span class="p">(</span><span class="n">Exception</span><span class="w"> </span><span class="n">e</span><span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">e</span><span class="p">.</span><span class="na">printStackTrace</span><span class="p">();</span>
<span class="p">}</span>
</pre></div>
</div>
<p>If the user forgets to close them, the native objects may be leaked.</p>
</section>
<section id="batchsize">
<h3><a class="toc-backref" href="#id10" role="doc-backlink">BatchSize</a><a class="headerlink" href="#batchsize" title="Permalink to this heading">#</a></h3>
<p>The <code class="docutils literal notranslate"><span class="pre">batchSize</span></code> argument of <code class="docutils literal notranslate"><span class="pre">ScanOptions</span></code> is a limit on the size of an individual batch.</p>
<p>For example, let’s try to read a Parquet file with gzip compression and 3 row groups:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span># Let's configure ScanOptions as:
ScanOptions options = new ScanOptions(/*batchSize*/ 32768);
$ parquet-tools meta data4_3rg_gzip.parquet
file schema: schema
age: OPTIONAL INT64 R:0 D:1
name: OPTIONAL BINARY L:STRING R:0 D:1
row group 1: RC:4 TS:182 OFFSET:4
row group 2: RC:4 TS:190 OFFSET:420
row group 3: RC:3 TS:179 OFFSET:838
</pre></div>
</div>
<p>Here, we set the batchSize in ScanOptions to 32768. Because that’s greater
than the number of rows in the next batch (4 rows, since the first
row group has only 4 rows), the program gets only 4 rows. The scanner
will not combine smaller batches to reach the limit, but it will split
large batches to stay under the limit. So if a row group had more
than 32768 rows, it would be split into blocks of 32768 rows or fewer.</p>
</section>
</section>
</section>
</article>
<footer class="prev-next-footer">
<div class="prev-next-area">
<a class="left-prev"
href="flight_sql_jdbc_driver.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">Arrow Flight SQL JDBC Driver</p>
</div>
</a>
<a class="right-next"
href="substrait.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">Substrait</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div>
</footer>
</div>
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div
id="pst-page-navigation-heading-2"
class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> On this page
</div>
<nav class="bd-toc-nav page-toc" aria-labelledby="pst-page-navigation-heading-2">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#schema">Schema</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#projection-subset-of-columns">Projection (Subset of Columns)</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#projection-produce-new-columns-and-filters">Projection (Produce New Columns) and Filters</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#read-data-from-hdfs">Read Data from HDFS</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#native-memory-management">Native Memory Management</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#usage-notes">Usage Notes</a><ul class="visible nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#native-object-resource-management">Native Object Resource Management</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#batchsize">BatchSize</a></li>
</ul>
</li>
</ul>
</nav></div>
<div class="sidebar-secondary-item">
<div class="tocsection editthispage">
<a href="https://github.com/apache/arrow/edit/main/docs/source/java/dataset.rst">
<i class="fa-solid fa-pencil"></i>
Edit on GitHub
</a>
</div>
</div>
</div></div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../_static/scripts/bootstrap.js?digest=8d27b9dea8ad943066ae"></script>
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=8d27b9dea8ad943066ae"></script>
<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
<div class="footer-items__start">
<div class="footer-item">
<p class="copyright">
© Copyright 2016-2024 Apache Software Foundation.
Apache Arrow, Arrow, Apache, the Apache feather logo, and the Apache Arrow project logo are either registered trademarks or trademarks of The Apache Software Foundation in the United States and other countries.
<br/>
</p>
</div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 6.2.0.
<br/>
</p>
</div>
</div>
<div class="footer-items__end">
<div class="footer-item">
<p class="theme-version">
Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.15.2.
</p></div>
</div>
</div>
</footer>
</body>
</html>