blob: feb79e18eb6e95f14c449258d5043e8483abb931 [file] [log] [blame]
<!doctype html>
<html class="no-js" lang="en" data-content_root="./">
<head><meta charset="utf-8"/>
<meta name="viewport" content="width=device-width,initial-scale=1"/>
<meta name="color-scheme" content="light dark"><meta name="viewport" content="width=device-width, initial-scale=1" />
<link rel="index" title="Index" href="genindex.html" /><link rel="search" title="Search" href="search.html" /><link rel="next" title="Substrait" href="substrait.html" /><link rel="prev" title="Arrow Flight SQL JDBC Driver" href="flight_sql_jdbc_driver.html" />
<!-- Generated with Sphinx 8.1.3 and Furo 2024.08.06 -->
<title>Dataset - arrow-java 18.1.0 documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=8f2a1f02" />
<link rel="stylesheet" type="text/css" href="_static/styles/furo.css?v=354aac6f" />
<link rel="stylesheet" type="text/css" href="_static/styles/furo-extensions.css?v=302659d7" />
<style>
body {
--color-code-background: #f8f8f8;
--color-code-foreground: black;
}
@media not print {
body[data-theme="dark"] {
--color-code-background: #202020;
--color-code-foreground: #d0d0d0;
}
@media (prefers-color-scheme: dark) {
body:not([data-theme="light"]) {
--color-code-background: #202020;
--color-code-foreground: #d0d0d0;
}
}
}
</style></head>
<body>
<script>
document.body.dataset.theme = localStorage.getItem("theme") || "auto";
</script>
<svg xmlns="http://www.w3.org/2000/svg" style="display: none;">
<symbol id="svg-toc" viewBox="0 0 24 24">
<title>Contents</title>
<svg stroke="currentColor" fill="currentColor" stroke-width="0" viewBox="0 0 1024 1024">
<path d="M408 442h480c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8H408c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8zm-8 204c0 4.4 3.6 8 8 8h480c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8H408c-4.4 0-8 3.6-8 8v56zm504-486H120c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8h784c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8zm0 632H120c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8h784c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8zM115.4 518.9L271.7 642c5.8 4.6 14.4.5 14.4-6.9V388.9c0-7.4-8.5-11.5-14.4-6.9L115.4 505.1a8.74 8.74 0 0 0 0 13.8z"/>
</svg>
</symbol>
<symbol id="svg-menu" viewBox="0 0 24 24">
<title>Menu</title>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather-menu">
<line x1="3" y1="12" x2="21" y2="12"></line>
<line x1="3" y1="6" x2="21" y2="6"></line>
<line x1="3" y1="18" x2="21" y2="18"></line>
</svg>
</symbol>
<symbol id="svg-arrow-right" viewBox="0 0 24 24">
<title>Expand</title>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather-chevron-right">
<polyline points="9 18 15 12 9 6"></polyline>
</svg>
</symbol>
<symbol id="svg-sun" viewBox="0 0 24 24">
<title>Light mode</title>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="1" stroke-linecap="round" stroke-linejoin="round" class="feather-sun">
<circle cx="12" cy="12" r="5"></circle>
<line x1="12" y1="1" x2="12" y2="3"></line>
<line x1="12" y1="21" x2="12" y2="23"></line>
<line x1="4.22" y1="4.22" x2="5.64" y2="5.64"></line>
<line x1="18.36" y1="18.36" x2="19.78" y2="19.78"></line>
<line x1="1" y1="12" x2="3" y2="12"></line>
<line x1="21" y1="12" x2="23" y2="12"></line>
<line x1="4.22" y1="19.78" x2="5.64" y2="18.36"></line>
<line x1="18.36" y1="5.64" x2="19.78" y2="4.22"></line>
</svg>
</symbol>
<symbol id="svg-moon" viewBox="0 0 24 24">
<title>Dark mode</title>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="1" stroke-linecap="round" stroke-linejoin="round" class="icon-tabler-moon">
<path stroke="none" d="M0 0h24v24H0z" fill="none" />
<path d="M12 3c.132 0 .263 0 .393 0a7.5 7.5 0 0 0 7.92 12.446a9 9 0 1 1 -8.313 -12.454z" />
</svg>
</symbol>
<symbol id="svg-sun-with-moon" viewBox="0 0 24 24">
<title>Auto light/dark, in light mode</title>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="1" stroke-linecap="round" stroke-linejoin="round"
class="icon-custom-derived-from-feather-sun-and-tabler-moon">
<path style="opacity: 50%" d="M 5.411 14.504 C 5.471 14.504 5.532 14.504 5.591 14.504 C 3.639 16.319 4.383 19.569 6.931 20.352 C 7.693 20.586 8.512 20.551 9.25 20.252 C 8.023 23.207 4.056 23.725 2.11 21.184 C 0.166 18.642 1.702 14.949 4.874 14.536 C 5.051 14.512 5.231 14.5 5.411 14.5 L 5.411 14.504 Z"/>
<line x1="14.5" y1="3.25" x2="14.5" y2="1.25"/>
<line x1="14.5" y1="15.85" x2="14.5" y2="17.85"/>
<line x1="10.044" y1="5.094" x2="8.63" y2="3.68"/>
<line x1="19" y1="14.05" x2="20.414" y2="15.464"/>
<line x1="8.2" y1="9.55" x2="6.2" y2="9.55"/>
<line x1="20.8" y1="9.55" x2="22.8" y2="9.55"/>
<line x1="10.044" y1="14.006" x2="8.63" y2="15.42"/>
<line x1="19" y1="5.05" x2="20.414" y2="3.636"/>
<circle cx="14.5" cy="9.55" r="3.6"/>
</svg>
</symbol>
<symbol id="svg-moon-with-sun" viewBox="0 0 24 24">
<title>Auto light/dark, in dark mode</title>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="1" stroke-linecap="round" stroke-linejoin="round"
class="icon-custom-derived-from-feather-sun-and-tabler-moon">
<path d="M 8.282 7.007 C 8.385 7.007 8.494 7.007 8.595 7.007 C 5.18 10.184 6.481 15.869 10.942 17.24 C 12.275 17.648 13.706 17.589 15 17.066 C 12.851 22.236 5.91 23.143 2.505 18.696 C -0.897 14.249 1.791 7.786 7.342 7.063 C 7.652 7.021 7.965 7 8.282 7 L 8.282 7.007 Z"/>
<line style="opacity: 50%" x1="18" y1="3.705" x2="18" y2="2.5"/>
<line style="opacity: 50%" x1="18" y1="11.295" x2="18" y2="12.5"/>
<line style="opacity: 50%" x1="15.316" y1="4.816" x2="14.464" y2="3.964"/>
<line style="opacity: 50%" x1="20.711" y1="10.212" x2="21.563" y2="11.063"/>
<line style="opacity: 50%" x1="14.205" y1="7.5" x2="13.001" y2="7.5"/>
<line style="opacity: 50%" x1="21.795" y1="7.5" x2="23" y2="7.5"/>
<line style="opacity: 50%" x1="15.316" y1="10.184" x2="14.464" y2="11.036"/>
<line style="opacity: 50%" x1="20.711" y1="4.789" x2="21.563" y2="3.937"/>
<circle style="opacity: 50%" cx="18" cy="7.5" r="2.169"/>
</svg>
</symbol>
<symbol id="svg-pencil" viewBox="0 0 24 24">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="1" stroke-linecap="round" stroke-linejoin="round" class="icon-tabler-pencil-code">
<path d="M4 20h4l10.5 -10.5a2.828 2.828 0 1 0 -4 -4l-10.5 10.5v4" />
<path d="M13.5 6.5l4 4" />
<path d="M20 21l2 -2l-2 -2" />
<path d="M17 17l-2 2l2 2" />
</svg>
</symbol>
<symbol id="svg-eye" viewBox="0 0 24 24">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
stroke-width="1" stroke-linecap="round" stroke-linejoin="round" class="icon-tabler-eye-code">
<path stroke="none" d="M0 0h24v24H0z" fill="none" />
<path d="M10 12a2 2 0 1 0 4 0a2 2 0 0 0 -4 0" />
<path
d="M11.11 17.958c-3.209 -.307 -5.91 -2.293 -8.11 -5.958c2.4 -4 5.4 -6 9 -6c3.6 0 6.6 2 9 6c-.21 .352 -.427 .688 -.647 1.008" />
<path d="M20 21l2 -2l-2 -2" />
<path d="M17 17l-2 2l2 2" />
</svg>
</symbol>
</svg>
<input type="checkbox" class="sidebar-toggle" name="__navigation" id="__navigation">
<input type="checkbox" class="sidebar-toggle" name="__toc" id="__toc">
<label class="overlay sidebar-overlay" for="__navigation">
<div class="visually-hidden">Hide navigation sidebar</div>
</label>
<label class="overlay toc-overlay" for="__toc">
<div class="visually-hidden">Hide table of contents sidebar</div>
</label>
<a class="skip-to-content muted-link" href="#furo-main-content">Skip to content</a>
<div class="page">
<header class="mobile-header">
<div class="header-left">
<label class="nav-overlay-icon" for="__navigation">
<div class="visually-hidden">Toggle site navigation sidebar</div>
<i class="icon"><svg><use href="#svg-menu"></use></svg></i>
</label>
</div>
<div class="header-center">
<a href="index.html"><div class="brand">arrow-java 18.1.0 documentation</div></a>
</div>
<div class="header-right">
<div class="theme-toggle-container theme-toggle-header">
<button class="theme-toggle">
<div class="visually-hidden">Toggle Light / Dark / Auto color theme</div>
<svg class="theme-icon-when-auto-light"><use href="#svg-sun-with-moon"></use></svg>
<svg class="theme-icon-when-auto-dark"><use href="#svg-moon-with-sun"></use></svg>
<svg class="theme-icon-when-dark"><use href="#svg-moon"></use></svg>
<svg class="theme-icon-when-light"><use href="#svg-sun"></use></svg>
</button>
</div>
<label class="toc-overlay-icon toc-header-icon" for="__toc">
<div class="visually-hidden">Toggle table of contents sidebar</div>
<i class="icon"><svg><use href="#svg-toc"></use></svg></i>
</label>
</div>
</header>
<aside class="sidebar-drawer">
<div class="sidebar-container">
<div class="sidebar-sticky"><a class="sidebar-brand" href="index.html">
<span class="sidebar-brand-text">arrow-java 18.1.0 documentation</span>
</a><form class="sidebar-search-container" method="get" action="search.html" role="search">
<input class="sidebar-search" placeholder="Search" name="q" aria-label="Search">
<input type="hidden" name="check_keywords" value="yes">
<input type="hidden" name="area" value="default">
</form>
<div id="searchbox"></div><div class="sidebar-scroll"><div class="sidebar-tree">
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="quickstartguide.html">Quick Start Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="overview.html">High-Level Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="install.html">Installing Java Modules</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="developers/index.html">Java Development</a><input class="toctree-checkbox" id="toctree-checkbox-1" name="toctree-checkbox-1" role="switch" type="checkbox"/><label for="toctree-checkbox-1"><div class="visually-hidden">Toggle navigation of Java Development</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="developers/building.html">Building Arrow Java</a></li>
<li class="toctree-l2"><a class="reference internal" href="developers/development.html">Development Guidelines</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="memory.html">Memory Management</a></li>
<li class="toctree-l1"><a class="reference internal" href="vector.html">ValueVector</a></li>
<li class="toctree-l1"><a class="reference internal" href="vector_schema_root.html">Tabular Data</a></li>
<li class="toctree-l1"><a class="reference internal" href="table.html">Table</a></li>
<li class="toctree-l1"><a class="reference internal" href="ipc.html">Reading/Writing IPC formats</a></li>
<li class="toctree-l1"><a class="reference internal" href="algorithm.html">Java Algorithms</a></li>
<li class="toctree-l1"><a class="reference internal" href="flight.html">Arrow Flight RPC</a></li>
<li class="toctree-l1"><a class="reference internal" href="flight_sql.html">Arrow Flight SQL</a></li>
<li class="toctree-l1"><a class="reference internal" href="flight_sql_jdbc_driver.html">Arrow Flight SQL JDBC Driver</a></li>
<li class="toctree-l1 current current-page"><a class="current reference internal" href="#">Dataset</a></li>
<li class="toctree-l1"><a class="reference internal" href="substrait.html">Substrait</a></li>
<li class="toctree-l1"><a class="reference internal" href="cdata.html">C Data Interface</a></li>
<li class="toctree-l1"><a class="reference internal" href="jdbc.html">Arrow JDBC Adapter</a></li>
<li class="toctree-l1"><a class="reference internal" href="reference/index.html">Reference (javadoc)</a></li>
<li class="toctree-l1"><a class="reference external" href="https://arrow.apache.org/cookbook/java/">Cookbook</a></li>
</ul>
</div>
</div>
</div>
</div>
</aside>
<div class="main">
<div class="content">
<div class="article-container">
<a href="#" class="back-to-top muted-link">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
<path d="M13 20h-2V8l-5.5 5.5-1.42-1.42L12 4.16l7.92 7.92-1.42 1.42L13 8v12z"></path>
</svg>
<span>Back to top</span>
</a>
<div class="content-icon-container">
<div class="view-this-page">
<a class="muted-link" href="_sources/dataset.rst.txt" title="View this page">
<svg><use href="#svg-eye"></use></svg>
<span class="visually-hidden">View this page</span>
</a>
</div>
<div class="theme-toggle-container theme-toggle-content">
<button class="theme-toggle">
<div class="visually-hidden">Toggle Light / Dark / Auto color theme</div>
<svg class="theme-icon-when-auto-light"><use href="#svg-sun-with-moon"></use></svg>
<svg class="theme-icon-when-auto-dark"><use href="#svg-moon-with-sun"></use></svg>
<svg class="theme-icon-when-dark"><use href="#svg-moon"></use></svg>
<svg class="theme-icon-when-light"><use href="#svg-sun"></use></svg>
</button>
</div>
<label class="toc-overlay-icon toc-content-icon" for="__toc">
<div class="visually-hidden">Toggle table of contents sidebar</div>
<i class="icon"><svg><use href="#svg-toc"></use></svg></i>
</label>
</div>
<article role="main" id="furo-main-content">
<section id="dataset">
<h1>Dataset<a class="headerlink" href="#dataset" title="Link to this heading"></a></h1>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>Experimental: The Java module <code class="docutils literal notranslate"><span class="pre">dataset</span></code> is currently under early
development. The API may change in each release of Apache Arrow until it
matures.</p>
</div>
<p>Dataset is a universal layer in Apache Arrow for querying data in different
formats or with different partitioning strategies. Usually the data to be queried
is located in a traditional file system; however, Arrow Dataset
is not designed only for querying files but can be extended to serve all
possible data sources, such as inter-process communication or other
network locations.</p>
<section id="getting-started">
<h2>Getting Started<a class="headerlink" href="#getting-started" title="Link to this heading"></a></h2>
<p>Currently supported file formats are:</p>
<ul class="simple">
<li><p>Apache Arrow (<code class="docutils literal notranslate"><span class="pre">.arrow</span></code>)</p></li>
<li><p>Apache ORC (<code class="docutils literal notranslate"><span class="pre">.orc</span></code>)</p></li>
<li><p>Apache Parquet (<code class="docutils literal notranslate"><span class="pre">.parquet</span></code>)</p></li>
<li><p>Comma-Separated Values (<code class="docutils literal notranslate"><span class="pre">.csv</span></code>)</p></li>
<li><p>Line-delimited JSON Values (<code class="docutils literal notranslate"><span class="pre">.json</span></code>)</p></li>
</ul>
<p>Below is a simple example of using Dataset to query a Parquet file in Java:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="c1">// read data from file /opt/example.parquet</span>
<span class="n">String</span><span class="w"> </span><span class="n">uri</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">&quot;file:/opt/example.parquet&quot;</span><span class="p">;</span>
<span class="n">ScanOptions</span><span class="w"> </span><span class="n">options</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">ScanOptions</span><span class="p">(</span><span class="cm">/*batchSize*/</span><span class="w"> </span><span class="mi">32768</span><span class="p">);</span>
<span class="k">try</span><span class="w"> </span><span class="p">(</span>
<span class="w"> </span><span class="n">BufferAllocator</span><span class="w"> </span><span class="n">allocator</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">RootAllocator</span><span class="p">();</span>
<span class="w"> </span><span class="n">DatasetFactory</span><span class="w"> </span><span class="n">datasetFactory</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">FileSystemDatasetFactory</span><span class="p">(</span>
<span class="w"> </span><span class="n">allocator</span><span class="p">,</span><span class="w"> </span><span class="n">NativeMemoryPool</span><span class="p">.</span><span class="na">getDefault</span><span class="p">(),</span>
<span class="w"> </span><span class="n">FileFormat</span><span class="p">.</span><span class="na">PARQUET</span><span class="p">,</span><span class="w"> </span><span class="n">uri</span><span class="p">);</span>
<span class="w"> </span><span class="n">Dataset</span><span class="w"> </span><span class="n">dataset</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">datasetFactory</span><span class="p">.</span><span class="na">finish</span><span class="p">();</span>
<span class="w"> </span><span class="n">Scanner</span><span class="w"> </span><span class="n">scanner</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">dataset</span><span class="p">.</span><span class="na">newScan</span><span class="p">(</span><span class="n">options</span><span class="p">);</span>
<span class="w"> </span><span class="n">ArrowReader</span><span class="w"> </span><span class="n">reader</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">scanner</span><span class="p">.</span><span class="na">scanBatches</span><span class="p">()</span>
<span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">List</span><span class="o">&lt;</span><span class="n">ArrowRecordBatch</span><span class="o">&gt;</span><span class="w"> </span><span class="n">batches</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">ArrayList</span><span class="o">&lt;&gt;</span><span class="p">();</span>
<span class="w"> </span><span class="k">while</span><span class="w"> </span><span class="p">(</span><span class="n">reader</span><span class="p">.</span><span class="na">loadNextBatch</span><span class="p">())</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="k">try</span><span class="w"> </span><span class="p">(</span><span class="n">VectorSchemaRoot</span><span class="w"> </span><span class="n">root</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">reader</span><span class="p">.</span><span class="na">getVectorSchemaRoot</span><span class="p">())</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="kd">final</span><span class="w"> </span><span class="n">VectorUnloader</span><span class="w"> </span><span class="n">unloader</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">VectorUnloader</span><span class="p">(</span><span class="n">root</span><span class="p">);</span>
<span class="w"> </span><span class="n">batches</span><span class="p">.</span><span class="na">add</span><span class="p">(</span><span class="n">unloader</span><span class="p">.</span><span class="na">getRecordBatch</span><span class="p">());</span>
<span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="c1">// do something with read record batches, for example:</span>
<span class="w"> </span><span class="n">analyzeArrowData</span><span class="p">(</span><span class="n">batches</span><span class="p">);</span>
<span class="w"> </span><span class="c1">// finished the analysis of the data, close all resources:</span>
<span class="w"> </span><span class="n">AutoCloseables</span><span class="p">.</span><span class="na">close</span><span class="p">(</span><span class="n">batches</span><span class="p">);</span>
<span class="p">}</span><span class="w"> </span><span class="k">catch</span><span class="w"> </span><span class="p">(</span><span class="n">Exception</span><span class="w"> </span><span class="n">e</span><span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">e</span><span class="p">.</span><span class="na">printStackTrace</span><span class="p">();</span>
<span class="p">}</span>
</pre></div>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p><code class="docutils literal notranslate"><span class="pre">ArrowRecordBatch</span></code> is a low-level composite Arrow data exchange format
that doesn’t provide an API to read typed data from it directly.
It’s recommended to use the utility <code class="docutils literal notranslate"><span class="pre">VectorLoader</span></code> to load it into a
schema-aware container <code class="docutils literal notranslate"><span class="pre">VectorSchemaRoot</span></code>, through which users can access
decoded data conveniently in Java.</p>
<p>The <code class="docutils literal notranslate"><span class="pre">ScanOptions</span> <span class="pre">batchSize</span></code> argument takes effect only if it is set to a value
smaller than the number of rows in the record batch.</p>
</div>
<div class="admonition seealso">
<p class="admonition-title">See also</p>
<p>Load record batches with <a class="reference internal" href="vector_schema_root.html"><span class="doc">VectorSchemaRoot</span></a>.</p>
</div>
</section>
<section id="schema">
<h2>Schema<a class="headerlink" href="#schema" title="Link to this heading"></a></h2>
<p>Schema of the data to be queried can be inspected via method
<code class="docutils literal notranslate"><span class="pre">DatasetFactory#inspect()</span></code> before actually reading it. For example:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="c1">// read data from local file /opt/example.parquet</span>
<span class="n">String</span><span class="w"> </span><span class="n">uri</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">&quot;file:/opt/example.parquet&quot;</span><span class="p">;</span>
<span class="n">BufferAllocator</span><span class="w"> </span><span class="n">allocator</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">RootAllocator</span><span class="p">(</span><span class="n">Long</span><span class="p">.</span><span class="na">MAX_VALUE</span><span class="p">);</span>
<span class="n">DatasetFactory</span><span class="w"> </span><span class="n">factory</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">FileSystemDatasetFactory</span><span class="p">(</span><span class="n">allocator</span><span class="p">,</span>
<span class="w"> </span><span class="n">NativeMemoryPool</span><span class="p">.</span><span class="na">getDefault</span><span class="p">(),</span><span class="w"> </span><span class="n">FileFormat</span><span class="p">.</span><span class="na">PARQUET</span><span class="p">,</span><span class="w"> </span><span class="n">uri</span><span class="p">);</span>
<span class="c1">// inspect schema</span>
<span class="n">Schema</span><span class="w"> </span><span class="n">schema</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">factory</span><span class="p">.</span><span class="na">inspect</span><span class="p">();</span>
</pre></div>
</div>
<p>For data formats that are compatible with a user-defined schema, users
can use the method <code class="docutils literal notranslate"><span class="pre">DatasetFactory#finish(Schema</span> <span class="pre">schema)</span></code> to create the dataset:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="n">Schema</span><span class="w"> </span><span class="n">schema</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">createUserSchema</span><span class="p">();</span>
<span class="n">Dataset</span><span class="w"> </span><span class="n">dataset</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">factory</span><span class="p">.</span><span class="na">finish</span><span class="p">(</span><span class="n">schema</span><span class="p">);</span>
</pre></div>
</div>
<p>Otherwise, when the no-argument method <code class="docutils literal notranslate"><span class="pre">DatasetFactory#finish()</span></code> is called,
the schema will be inferred automatically from the data source, the same as the result
of <code class="docutils literal notranslate"><span class="pre">DatasetFactory#inspect()</span></code>.</p>
<p>Also, if a projection is specified during scanning (see next section
<a class="reference internal" href="#java-dataset-projection"><span class="std std-ref">Projection (Subset of Columns)</span></a>), the actual schema of the output data can be obtained
via the method <code class="docutils literal notranslate"><span class="pre">Scanner#schema()</span></code>:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="n">Scanner</span><span class="w"> </span><span class="n">scanner</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">dataset</span><span class="p">.</span><span class="na">newScan</span><span class="p">(</span>
<span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">ScanOptions</span><span class="p">(</span><span class="mi">32768</span><span class="p">,</span><span class="w"> </span><span class="n">Optional</span><span class="p">.</span><span class="na">of</span><span class="p">(</span><span class="k">new</span><span class="w"> </span><span class="n">String</span><span class="o">[]</span><span class="w"> </span><span class="p">{</span><span class="s">&quot;id&quot;</span><span class="p">,</span><span class="w"> </span><span class="s">&quot;name&quot;</span><span class="p">})));</span>
<span class="n">Schema</span><span class="w"> </span><span class="n">projectedSchema</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">scanner</span><span class="p">.</span><span class="na">schema</span><span class="p">();</span>
</pre></div>
</div>
</section>
<section id="projection-subset-of-columns">
<span id="java-dataset-projection"></span><h2>Projection (Subset of Columns)<a class="headerlink" href="#projection-subset-of-columns" title="Link to this heading"></a></h2>
<p>User can specify projections in ScanOptions. For example:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="n">String</span><span class="o">[]</span><span class="w"> </span><span class="n">projection</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">String</span><span class="o">[]</span><span class="w"> </span><span class="p">{</span><span class="s">&quot;id&quot;</span><span class="p">,</span><span class="w"> </span><span class="s">&quot;name&quot;</span><span class="p">};</span>
<span class="n">ScanOptions</span><span class="w"> </span><span class="n">options</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">ScanOptions</span><span class="p">(</span><span class="mi">32768</span><span class="p">,</span><span class="w"> </span><span class="n">Optional</span><span class="p">.</span><span class="na">of</span><span class="p">(</span><span class="n">projection</span><span class="p">));</span>
</pre></div>
</div>
<p>If no projection is needed, leave the optional projection argument absent in
ScanOptions:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="n">ScanOptions</span><span class="w"> </span><span class="n">options</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">ScanOptions</span><span class="p">(</span><span class="mi">32768</span><span class="p">,</span><span class="w"> </span><span class="n">Optional</span><span class="p">.</span><span class="na">empty</span><span class="p">());</span>
</pre></div>
</div>
<p>Or use shortcut constructor:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="n">ScanOptions</span><span class="w"> </span><span class="n">options</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">ScanOptions</span><span class="p">(</span><span class="mi">32768</span><span class="p">);</span>
</pre></div>
</div>
<p>Then all columns will be emitted during scanning.</p>
</section>
<section id="projection-produce-new-columns-and-filters">
<h2>Projection (Produce New Columns) and Filters<a class="headerlink" href="#projection-produce-new-columns-and-filters" title="Link to this heading"></a></h2>
<p>User can specify projections (new columns) or filters in ScanOptions using Substrait. For example:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="n">ByteBuffer</span><span class="w"> </span><span class="n">substraitExpressionFilter</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">getSubstraitExpressionFilter</span><span class="p">();</span>
<span class="n">ByteBuffer</span><span class="w"> </span><span class="n">substraitExpressionProject</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">getSubstraitExpressionProjection</span><span class="p">();</span>
<span class="c1">// Use Substrait APIs to create an Expression and serialize to a ByteBuffer</span>
<span class="n">ScanOptions</span><span class="w"> </span><span class="n">options</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">ScanOptions</span><span class="p">.</span><span class="na">Builder</span><span class="p">(</span><span class="cm">/*batchSize*/</span><span class="w"> </span><span class="mi">32768</span><span class="p">)</span>
<span class="w"> </span><span class="p">.</span><span class="na">columns</span><span class="p">(</span><span class="n">Optional</span><span class="p">.</span><span class="na">empty</span><span class="p">())</span>
<span class="w"> </span><span class="p">.</span><span class="na">substraitExpressionFilter</span><span class="p">(</span><span class="n">substraitExpressionFilter</span><span class="p">)</span>
<span class="w"> </span><span class="p">.</span><span class="na">substraitExpressionProjection</span><span class="p">(</span><span class="n">substraitExpressionProject</span><span class="p">)</span>
<span class="w"> </span><span class="p">.</span><span class="na">build</span><span class="p">();</span>
</pre></div>
</div>
<div class="admonition seealso">
<p class="admonition-title">See also</p>
<dl class="simple">
<dt><a class="reference internal" href="substrait.html"><span class="doc">Executing Projections and Filters Using Extended Expressions</span></a></dt><dd><p>Projections and Filters using Substrait.</p>
</dd>
</dl>
</div>
</section>
<section id="read-data-from-hdfs">
<h2>Read Data from HDFS<a class="headerlink" href="#read-data-from-hdfs" title="Link to this heading"></a></h2>
<p><code class="docutils literal notranslate"><span class="pre">FileSystemDataset</span></code> supports reading data from non-local file systems. HDFS
support is included in the official Apache Arrow Java package releases and
can be used directly without re-building the source code.</p>
<p>To access HDFS data using Dataset API, pass a general HDFS URI to
<code class="docutils literal notranslate"><span class="pre">FileSystemDatasetFactory</span></code>:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="n">String</span><span class="w"> </span><span class="n">uri</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">&quot;hdfs://{hdfs_host}:{port}/data/example.parquet&quot;</span><span class="p">;</span>
<span class="n">BufferAllocator</span><span class="w"> </span><span class="n">allocator</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">RootAllocator</span><span class="p">(</span><span class="n">Long</span><span class="p">.</span><span class="na">MAX_VALUE</span><span class="p">);</span>
<span class="n">DatasetFactory</span><span class="w"> </span><span class="n">factory</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">FileSystemDatasetFactory</span><span class="p">(</span><span class="n">allocator</span><span class="p">,</span>
<span class="w"> </span><span class="n">NativeMemoryPool</span><span class="p">.</span><span class="na">getDefault</span><span class="p">(),</span><span class="w"> </span><span class="n">FileFormat</span><span class="p">.</span><span class="na">PARQUET</span><span class="p">,</span><span class="w"> </span><span class="n">uri</span><span class="p">);</span>
</pre></div>
</div>
</section>
<section id="native-memory-management">
<h2>Native Memory Management<a class="headerlink" href="#native-memory-management" title="Link to this heading"></a></h2>
<p>To gain better performance and reduce code complexity, Java
<code class="docutils literal notranslate"><span class="pre">FileSystemDataset</span></code> internally relies on C++
<code class="docutils literal notranslate"><span class="pre">arrow::dataset::FileSystemDataset</span></code> via JNI.
As a result, all Arrow data read from <code class="docutils literal notranslate"><span class="pre">FileSystemDataset</span></code> is supposed to be
allocated off the JVM heap. To manage this part of memory, a utility class
<code class="docutils literal notranslate"><span class="pre">NativeMemoryPool</span></code> is provided to users.</p>
<p>As a basic example, by using a listenable <code class="docutils literal notranslate"><span class="pre">NativeMemoryPool</span></code>, users can pass
a listener hooking on C++ buffer allocation/deallocation:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="n">AtomicLong</span><span class="w"> </span><span class="n">reserved</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">AtomicLong</span><span class="p">(</span><span class="mi">0</span><span class="n">L</span><span class="p">);</span>
<span class="n">ReservationListener</span><span class="w"> </span><span class="n">listener</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">ReservationListener</span><span class="p">()</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nd">@Override</span>
<span class="w"> </span><span class="kd">public</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="nf">reserve</span><span class="p">(</span><span class="kt">long</span><span class="w"> </span><span class="n">size</span><span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">reserved</span><span class="p">.</span><span class="na">getAndAdd</span><span class="p">(</span><span class="n">size</span><span class="p">);</span>
<span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="nd">@Override</span>
<span class="w"> </span><span class="kd">public</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="nf">unreserve</span><span class="p">(</span><span class="kt">long</span><span class="w"> </span><span class="n">size</span><span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">reserved</span><span class="p">.</span><span class="na">getAndAdd</span><span class="p">(</span><span class="o">-</span><span class="n">size</span><span class="p">);</span>
<span class="w"> </span><span class="p">}</span>
<span class="p">};</span>
<span class="n">NativeMemoryPool</span><span class="w"> </span><span class="n">pool</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">NativeMemoryPool</span><span class="p">.</span><span class="na">createListenable</span><span class="p">(</span><span class="n">listener</span><span class="p">);</span>
<span class="n">FileSystemDatasetFactory</span><span class="w"> </span><span class="n">factory</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">FileSystemDatasetFactory</span><span class="p">(</span><span class="n">allocator</span><span class="p">,</span>
<span class="w"> </span><span class="n">pool</span><span class="p">,</span><span class="w"> </span><span class="n">FileFormat</span><span class="p">.</span><span class="na">PARQUET</span><span class="p">,</span><span class="w"> </span><span class="n">uri</span><span class="p">);</span>
</pre></div>
</div>
<p>Also, it’s a very common case to reserve the same amount of JVM direct memory
for the data read from datasets. For this use, a built-in utility
class <code class="docutils literal notranslate"><span class="pre">DirectReservationListener</span></code> is provided:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="n">NativeMemoryPool</span><span class="w"> </span><span class="n">pool</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">NativeMemoryPool</span><span class="p">.</span><span class="na">createListenable</span><span class="p">(</span>
<span class="w"> </span><span class="n">DirectReservationListener</span><span class="p">.</span><span class="na">instance</span><span class="p">());</span>
</pre></div>
</div>
<p>This way, once the allocated byte count of Arrow buffers reaches the limit of
JVM direct memory, <code class="docutils literal notranslate"><span class="pre">OutOfMemoryError:</span> <span class="pre">Direct</span> <span class="pre">buffer</span> <span class="pre">memory</span></code> will
be thrown during scanning.</p>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>The default instance <code class="docutils literal notranslate"><span class="pre">NativeMemoryPool.getDefault()</span></code> does
nothing on buffer allocation/deallocation. It’s OK to use it in
the case of POC or testing, but for production use in complex environments,
it’s recommended to manage memory by using a listenable memory pool.</p>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>The <code class="docutils literal notranslate"><span class="pre">BufferAllocator</span></code> instance passed to <code class="docutils literal notranslate"><span class="pre">FileSystemDatasetFactory</span></code>’s
constructor is also aware of the overall memory usage of the produced
dataset instances. Once the Java buffers are created the passed allocator
will become their parent allocator.</p>
</div>
</section>
<section id="usage-notes">
<h2>Usage Notes<a class="headerlink" href="#usage-notes" title="Link to this heading"></a></h2>
<section id="native-object-resource-management">
<h3>Native Object Resource Management<a class="headerlink" href="#native-object-resource-management" title="Link to this heading"></a></h3>
<p>As another result of relying on JNI, all components related to
<code class="docutils literal notranslate"><span class="pre">FileSystemDataset</span></code> should be closed manually or with try-with-resources to
release the corresponding native objects after use. For example:</p>
<div class="highlight-Java notranslate"><div class="highlight"><pre><span></span><span class="n">String</span><span class="w"> </span><span class="n">uri</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="s">&quot;file:/opt/example.parquet&quot;</span><span class="p">;</span>
<span class="n">ScanOptions</span><span class="w"> </span><span class="n">options</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">ScanOptions</span><span class="p">(</span><span class="cm">/*batchSize*/</span><span class="w"> </span><span class="mi">32768</span><span class="p">);</span>
<span class="k">try</span><span class="w"> </span><span class="p">(</span>
<span class="w"> </span><span class="n">BufferAllocator</span><span class="w"> </span><span class="n">allocator</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">RootAllocator</span><span class="p">();</span>
<span class="w"> </span><span class="n">DatasetFactory</span><span class="w"> </span><span class="n">factory</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="k">new</span><span class="w"> </span><span class="n">FileSystemDatasetFactory</span><span class="p">(</span>
<span class="w"> </span><span class="n">allocator</span><span class="p">,</span><span class="w"> </span><span class="n">NativeMemoryPool</span><span class="p">.</span><span class="na">getDefault</span><span class="p">(),</span>
<span class="w"> </span><span class="n">FileFormat</span><span class="p">.</span><span class="na">PARQUET</span><span class="p">,</span><span class="w"> </span><span class="n">uri</span><span class="p">);</span>
<span class="w"> </span><span class="n">Dataset</span><span class="w"> </span><span class="n">dataset</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">factory</span><span class="p">.</span><span class="na">finish</span><span class="p">();</span>
<span class="w"> </span><span class="n">Scanner</span><span class="w"> </span><span class="n">scanner</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">dataset</span><span class="p">.</span><span class="na">newScan</span><span class="p">(</span><span class="n">options</span><span class="p">)</span>
<span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="c1">// do something</span>
<span class="p">}</span><span class="w"> </span><span class="k">catch</span><span class="w"> </span><span class="p">(</span><span class="n">Exception</span><span class="w"> </span><span class="n">e</span><span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">e</span><span class="p">.</span><span class="na">printStackTrace</span><span class="p">();</span>
<span class="p">}</span>
</pre></div>
</div>
<p>If the user forgets to close them, the native objects may leak.</p>
</section>
<section id="batchsize">
<h3>BatchSize<a class="headerlink" href="#batchsize" title="Link to this heading"></a></h3>
<p>The <code class="docutils literal notranslate"><span class="pre">batchSize</span></code> argument of <code class="docutils literal notranslate"><span class="pre">ScanOptions</span></code> is a limit on the size of an individual batch.</p>
<p>For example, let’s try to read a Parquet file with gzip compression and 3 row groups:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span># Let's configure ScanOptions as:
ScanOptions options = new ScanOptions(/*batchSize*/ 32768);
$ parquet-tools meta data4_3rg_gzip.parquet
file schema: schema
age: OPTIONAL INT64 R:0 D:1
name: OPTIONAL BINARY L:STRING R:0 D:1
row group 1: RC:4 TS:182 OFFSET:4
row group 2: RC:4 TS:190 OFFSET:420
row group 3: RC:3 TS:179 OFFSET:838
</pre></div>
</div>
<p>Here, we set the batchSize in ScanOptions to 32768. The first row group
has only 4 rows, so the next batch contains 4 rows, which is fewer than the
limit, and the program gets only those 4 rows. The scanner
will not combine smaller batches to reach the limit, but it will split
large batches to stay within the limit. So if a row group had more
than 32768 rows, it would be split into batches of 32768 rows or fewer.</p>
</section>
</section>
</section>
</article>
</div>
<footer>
<div class="related-pages">
<a class="next-page" href="substrait.html">
<div class="page-info">
<div class="context">
<span>Next</span>
</div>
<div class="title">Substrait</div>
</div>
<svg class="furo-related-icon"><use href="#svg-arrow-right"></use></svg>
</a>
<a class="prev-page" href="flight_sql_jdbc_driver.html">
<svg class="furo-related-icon"><use href="#svg-arrow-right"></use></svg>
<div class="page-info">
<div class="context">
<span>Previous</span>
</div>
<div class="title">Arrow Flight SQL JDBC Driver</div>
</div>
</a>
</div>
<div class="bottom-of-page">
<div class="left-details">
<div class="copyright">
Copyright &#169; 2025, Apache Arrow Developers
</div>
Made with <a href="https://www.sphinx-doc.org/">Sphinx</a> and <a class="muted-link" href="https://pradyunsg.me">@pradyunsg</a>'s
<a href="https://github.com/pradyunsg/furo">Furo</a>
</div>
<div class="right-details">
</div>
</div>
</footer>
</div>
<aside class="toc-drawer">
<div class="toc-sticky toc-scroll">
<div class="toc-title-container">
<span class="toc-title">
On this page
</span>
</div>
<div class="toc-tree-container">
<div class="toc-tree">
<ul>
<li><a class="reference internal" href="#">Dataset</a><ul>
<li><a class="reference internal" href="#getting-started">Getting Started</a></li>
<li><a class="reference internal" href="#schema">Schema</a></li>
<li><a class="reference internal" href="#projection-subset-of-columns">Projection (Subset of Columns)</a></li>
<li><a class="reference internal" href="#projection-produce-new-columns-and-filters">Projection (Produce New Columns) and Filters</a></li>
<li><a class="reference internal" href="#read-data-from-hdfs">Read Data from HDFS</a></li>
<li><a class="reference internal" href="#native-memory-management">Native Memory Management</a></li>
<li><a class="reference internal" href="#usage-notes">Usage Notes</a><ul>
<li><a class="reference internal" href="#native-object-resource-management">Native Object Resource Management</a></li>
<li><a class="reference internal" href="#batchsize">BatchSize</a></li>
</ul>
</li>
</ul>
</li>
</ul>
</div>
</div>
</div>
</aside>
</div>
</div><script src="_static/documentation_options.js?v=c4c92189"></script>
<script src="_static/doctools.js?v=9bcbadda"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/scripts/furo.js?v=5fa4622c"></script>
</body>
</html>