blob: 3a1a3b04318268c19902b026dfb2ce67d54d6942 [file]
<!DOCTYPE html>
<html lang="en" data-content_root="../" >
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<title>DataFrame and SQL &#8212; Apache DataFusion Java documentation</title>
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
</script>
<!--
this give us a css class that will be invisible only if js is disabled
-->
<noscript>
<style>
.pst-js-only { display: none !important; }
</style>
</noscript>
<!-- Loaded before other Sphinx assets -->
<link href="../_static/styles/theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=8f2a1f02" />
<!-- So that users can add custom icons -->
<script src="../_static/scripts/fontawesome.js?digest=8878045cc6db502f8baf"></script>
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf" />
<script src="../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../_static/doctools.js?v=9a2dae69"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script>DOCUMENTATION_OPTIONS.pagename = 'user-guide/dataframe';</script>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="Parquet" href="parquet.html" />
<link rel="prev" title="SessionContext" href="sessioncontext.html" />
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="" />
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
<div id="pst-scroll-pixel-helper"></div>
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
<i class="fa-solid fa-arrow-up"></i>Back to top</button>
<dialog id="pst-search-dialog">
<form class="bd-search d-flex align-items-center"
action="../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
placeholder="Search the docs ..."
aria-label="Search the docs ..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form>
</dialog>
<div class="pst-async-banner-revealer d-none">
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
</div>
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
<div class="bd-header__inner bd-page-width">
<button class="pst-navbar-icon sidebar-toggle primary-toggle" aria-label="Site navigation">
<span class="fa-solid fa-bars"></span>
</button>
<div class="col-lg-3 navbar-header-items__start">
<div class="navbar-item">
<a class="navbar-brand logo" href="../index.html">
<p class="title logo__title">Apache DataFusion Java documentation</p>
</a></div>
</div>
<div class="col-lg-9 navbar-header-items">
<div class="me-auto navbar-header-items__center">
<div class="navbar-item">
<nav>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item ">
<a class="nav-link nav-external" href="https://github.com/apache/datafusion-java">
GitHub Repository
</a>
</li>
<li class="nav-item ">
<a class="nav-link nav-external" href="https://github.com/apache/datafusion-java/issues">
Issue Tracker
</a>
</li>
<li class="nav-item ">
<a class="nav-link nav-external" href="https://datafusion.apache.org/">
Apache DataFusion
</a>
</li>
<li class="nav-item ">
<a class="nav-link nav-external" href="https://github.com/apache/datafusion/blob/main/CODE_OF_CONDUCT.md">
Code of Conduct
</a>
</li>
<li class="nav-item current active">
<a class="nav-link nav-internal" href="index.html">
User Guide
</a>
</li>
<li class="nav-item dropdown">
<button class="btn dropdown-toggle nav-item" type="button"
data-bs-toggle="dropdown" aria-expanded="false"
aria-controls="pst-nav-more-links">
More
</button>
<ul id="pst-nav-more-links" class="dropdown-menu">
<li class=" ">
<a class="nav-link dropdown-item nav-internal" href="../contributor-guide/index.html">
Contributor Guide
</a>
</li>
</ul>
</li>
</ul>
</nav></div>
</div>
<div class="navbar-header-items__end">
<div class="navbar-item navbar-persistent--container">
<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
</div>
<div class="navbar-item">
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light" title="Light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark" title="Dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto" title="System Settings"></i>
</button></div>
</div>
</div>
<div class="navbar-persistent--mobile">
<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
</div>
<button class="pst-navbar-icon sidebar-toggle secondary-toggle" aria-label="On this page">
<span class="fa-solid fa-outdent"></span>
</button>
</div>
</header>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<dialog id="pst-primary-sidebar-modal"></dialog>
<div id="pst-primary-sidebar" class="bd-sidebar-primary bd-sidebar">
<div class="sidebar-header-items sidebar-primary__section">
<div class="sidebar-header-items__center">
<div class="navbar-item">
<nav>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item ">
<a class="nav-link nav-external" href="https://github.com/apache/datafusion-java">
GitHub Repository
</a>
</li>
<li class="nav-item ">
<a class="nav-link nav-external" href="https://github.com/apache/datafusion-java/issues">
Issue Tracker
</a>
</li>
<li class="nav-item ">
<a class="nav-link nav-external" href="https://datafusion.apache.org/">
Apache DataFusion
</a>
</li>
<li class="nav-item ">
<a class="nav-link nav-external" href="https://github.com/apache/datafusion/blob/main/CODE_OF_CONDUCT.md">
Code of Conduct
</a>
</li>
<li class="nav-item current active">
<a class="nav-link nav-internal" href="index.html">
User Guide
</a>
</li>
<li class="nav-item ">
<a class="nav-link nav-internal" href="../contributor-guide/index.html">
Contributor Guide
</a>
</li>
</ul>
</nav></div>
</div>
<div class="sidebar-header-items__end">
<div class="navbar-item">
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light" title="Light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark" title="Dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto" title="System Settings"></i>
</button></div>
</div>
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item">
<nav class="bd-docs-nav bd-links"
aria-label="Section Navigation">
<p class="bd-links__title" role="heading" aria-level="1">Section Navigation</p>
<div class="bd-toc-item navbar-nav"><ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="quickstart.html">Quickstart</a></li>
<li class="toctree-l1"><a class="reference internal" href="sessioncontext.html">SessionContext</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">DataFrame and SQL</a></li>
<li class="toctree-l1"><a class="reference internal" href="parquet.html">Parquet</a></li>
<li class="toctree-l1"><a class="reference internal" href="proto-plans.html">Logical plans via datafusion-proto</a></li>
<li class="toctree-l1"><a class="reference internal" href="scalar-udf.html">Scalar UDFs</a></li>
<li class="toctree-l1"><a class="reference internal" href="table-provider.html">Java table providers</a></li>
<li class="toctree-l1"><a class="reference internal" href="api-reference.html">API Reference</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
<div class="sidebar-primary-item">
<div id="ethical-ad-placement"
class="flat"
data-ea-publisher="readthedocs"
data-ea-type="readthedocs-sidebar"
data-ea-manual="true">
</div></div>
</div>
</div>
<main id="main-content" class="bd-main" role="main">
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article d-print-none">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item">
<nav aria-label="Breadcrumb" class="d-print-none">
<ul class="bd-breadcrumbs">
<li class="breadcrumb-item breadcrumb-home">
<a href="../index.html" class="nav-link" aria-label="Home">
<i class="fa-solid fa-home"></i>
</a>
</li>
<li class="breadcrumb-item"><a href="index.html" class="nav-link">User Guide</a></li>
<li class="breadcrumb-item active" aria-current="page"><span class="ellipsis">DataFrame and SQL</span></li>
</ul>
</nav>
</div>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article">
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<section id="dataframe-and-sql">
<h1>DataFrame and SQL<a class="headerlink" href="#dataframe-and-sql" title="Link to this heading">#</a></h1>
<p>DataFusion Java supports two query interfaces: SQL strings via
<code class="docutils literal notranslate"><span class="pre">SessionContext.sql(String)</span></code>, and a programmatic DataFrame API.</p>
<section id="sql">
<h2>SQL<a class="headerlink" href="#sql" title="Link to this heading">#</a></h2>
<div class="highlight-java notranslate"><div class="highlight"><pre><span></span><span class="k">try</span><span class="w"> </span><span class="p">(</span><span class="n">DataFrame</span><span class="w"> </span><span class="n">df</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ctx</span><span class="p">.</span><span class="na">sql</span><span class="p">(</span><span class="s">&quot;SELECT a, b FROM t WHERE a &gt; 10&quot;</span><span class="p">))</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">df</span><span class="p">.</span><span class="na">show</span><span class="p">();</span>
<span class="p">}</span>
</pre></div>
</div>
<p><code class="docutils literal notranslate"><span class="pre">sql(String)</span></code> plans the query and returns a <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code>. Execution does
not start until you pull results.</p>
</section>
<section id="dataframe-transformations">
<h2>DataFrame transformations<a class="headerlink" href="#dataframe-transformations" title="Link to this heading">#</a></h2>
<p>The DataFrame API exposes <code class="docutils literal notranslate"><span class="pre">select</span></code>, <code class="docutils literal notranslate"><span class="pre">filter</span></code>, <code class="docutils literal notranslate"><span class="pre">limit</span></code>, <code class="docutils literal notranslate"><span class="pre">distinct</span></code>,
<code class="docutils literal notranslate"><span class="pre">dropColumns</span></code>, and <code class="docutils literal notranslate"><span class="pre">withColumnRenamed</span></code>.</p>
<div class="highlight-java notranslate"><div class="highlight"><pre><span></span><span class="k">try</span><span class="w"> </span><span class="p">(</span><span class="n">DataFrame</span><span class="w"> </span><span class="n">df</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ctx</span><span class="p">.</span><span class="na">readParquet</span><span class="p">(</span><span class="s">&quot;/path/to/orders.parquet&quot;</span><span class="p">))</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="k">try</span><span class="w"> </span><span class="p">(</span><span class="n">DataFrame</span><span class="w"> </span><span class="n">filtered</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">df</span><span class="p">.</span><span class="na">filter</span><span class="p">(</span><span class="s">&quot;o_orderpriority = &#39;1-URGENT&#39;&quot;</span><span class="p">))</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">filtered</span><span class="p">.</span><span class="na">show</span><span class="p">();</span>
<span class="w"> </span><span class="p">}</span>
<span class="p">}</span>
</pre></div>
</div>
<p>Each transformation returns a new <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code> that must be closed.</p>
</section>
<section id="pulling-results">
<h2>Pulling results<a class="headerlink" href="#pulling-results" title="Link to this heading">#</a></h2>
<p>Three patterns are available:</p>
<p><strong>Stream as Arrow batches.</strong> Use <code class="docutils literal notranslate"><span class="pre">collect(allocator)</span></code> to pull the result
set as Arrow record batches via the <a class="reference external" href="https://arrow.apache.org/docs/format/CDataInterface.html">Arrow C Data Interface</a>:</p>
<div class="highlight-java notranslate"><div class="highlight"><pre><span></span><span class="k">try</span><span class="w"> </span><span class="p">(</span><span class="n">DataFrame</span><span class="w"> </span><span class="n">df</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ctx</span><span class="p">.</span><span class="na">sql</span><span class="p">(</span><span class="s">&quot;SELECT ...&quot;</span><span class="p">);</span>
<span class="w"> </span><span class="n">ArrowReader</span><span class="w"> </span><span class="n">reader</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">df</span><span class="p">.</span><span class="na">collect</span><span class="p">(</span><span class="n">allocator</span><span class="p">))</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="k">while</span><span class="w"> </span><span class="p">(</span><span class="n">reader</span><span class="p">.</span><span class="na">loadNextBatch</span><span class="p">())</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="kd">var</span><span class="w"> </span><span class="n">batch</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">reader</span><span class="p">.</span><span class="na">getVectorSchemaRoot</span><span class="p">();</span>
<span class="w"> </span><span class="c1">// process batch...</span>
<span class="w"> </span><span class="p">}</span>
<span class="p">}</span>
</pre></div>
</div>
<p><strong>Count rows.</strong> <code class="docutils literal notranslate"><span class="pre">df.count()</span></code> returns the row count without materializing
the rows in the JVM.</p>
<p><strong>Print for inspection.</strong> <code class="docutils literal notranslate"><span class="pre">df.show()</span></code> and <code class="docutils literal notranslate"><span class="pre">df.show(int</span> <span class="pre">n)</span></code> print results
to standard output. Useful for exploration; not appropriate for
production code paths.</p>
</section>
<section id="schema-introspection">
<h2>Schema introspection<a class="headerlink" href="#schema-introspection" title="Link to this heading">#</a></h2>
<p>To get the schema of a registered table without running a query:</p>
<div class="highlight-java notranslate"><div class="highlight"><pre><span></span><span class="n">org</span><span class="p">.</span><span class="na">apache</span><span class="p">.</span><span class="na">arrow</span><span class="p">.</span><span class="na">vector</span><span class="p">.</span><span class="na">types</span><span class="p">.</span><span class="na">pojo</span><span class="p">.</span><span class="na">Schema</span><span class="w"> </span><span class="n">schema</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ctx</span><span class="p">.</span><span class="na">tableSchema</span><span class="p">(</span><span class="s">&quot;orders&quot;</span><span class="p">);</span>
</pre></div>
</div>
</section>
<section id="plan-input">
<h2>Plan input<a class="headerlink" href="#plan-input" title="Link to this heading">#</a></h2>
<p>A DataFusion logical plan can be deserialized from <code class="docutils literal notranslate"><span class="pre">datafusion-proto</span></code>
bytes via <code class="docutils literal notranslate"><span class="pre">SessionContext.fromProto(byte[])</span></code>. The <code class="docutils literal notranslate"><span class="pre">datafusion-proto</span></code> Java
classes are generated by the Maven build. This is useful for accepting
plans produced by other DataFusion-aware tooling.</p>
</section>
</section>
</article>
<footer class="prev-next-footer d-print-none">
<div class="prev-next-area">
<a class="left-prev"
href="sessioncontext.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">SessionContext</p>
</div>
</a>
<a class="right-next"
href="parquet.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">Parquet</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div>
</footer>
</div>
<dialog id="pst-secondary-sidebar-modal"></dialog>
<div id="pst-secondary-sidebar" class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div
id="pst-page-navigation-heading-2"
class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> On this page
</div>
<nav class="bd-toc-nav page-toc" aria-labelledby="pst-page-navigation-heading-2">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#sql">SQL</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#dataframe-transformations">DataFrame transformations</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#pulling-results">Pulling results</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#schema-introspection">Schema introspection</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#plan-input">Plan input</a></li>
</ul>
</nav></div>
<div class="sidebar-secondary-item">
<div role="note" aria-label="source link">
<h3>This Page</h3>
<ul class="this-page-menu">
<li><a href="../_sources/user-guide/dataframe.md.txt"
rel="nofollow">Show Source</a></li>
</ul>
</div></div>
</div></div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script defer src="../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf"></script>
<script defer src="../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf"></script>
<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
<div class="footer-items__start">
<div class="footer-item">
<p class="copyright">
© Copyright 2026, Apache Software Foundation.
<br/>
</p>
</div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 7.4.7.
<br/>
</p>
</div>
</div>
<div class="footer-items__end">
<div class="footer-item">
<p class="theme-version">
<!-- # L10n: Setting the PST URL as an argument as this does not need to be localized -->
Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.16.1.
</p></div>
</div>
</div>
</footer>
</body>
</html>