| |
| <!DOCTYPE html> |
| |
| |
| <html lang="en" data-content_root="../" > |
| |
| <head> |
| <meta charset="utf-8" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| |
| <title>Using the DataFrame API — Apache DataFusion documentation</title> |
| |
| |
| |
| <script data-cfasync="false"> |
| document.documentElement.dataset.mode = localStorage.getItem("mode") || ""; |
| document.documentElement.dataset.theme = localStorage.getItem("theme") || ""; |
| </script> |
| <!-- |
| This gives us a CSS class that will be invisible only if JS is disabled. |
| --> |
| <noscript> |
| <style> |
| .pst-js-only { display: none !important; } |
| |
| </style> |
| </noscript> |
| |
| <!-- Loaded before other Sphinx assets --> |
| <link href="../_static/styles/theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" /> |
| <link href="../_static/styles/pydata-sphinx-theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" /> |
| |
| <link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=8f2a1f02" /> |
| <link rel="stylesheet" type="text/css" href="../_static/theme_overrides.css?v=d08b24aa" /> |
| |
| <!-- So that users can add custom icons --> |
| <script src="../_static/scripts/fontawesome.js?digest=8878045cc6db502f8baf"></script> |
| <!-- Pre-loaded scripts that we'll load fully later --> |
| <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf" /> |
| <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf" /> |
| |
| <script src="../_static/documentation_options.js?v=5929fcd5"></script> |
| <script src="../_static/doctools.js?v=fd6eb6e6"></script> |
| <script src="../_static/sphinx_highlight.js?v=6ffebe34"></script> |
| <script>DOCUMENTATION_OPTIONS.pagename = 'library-user-guide/using-the-dataframe-api';</script> |
| <link rel="icon" href="../_static/favicon.svg"/> |
| <link rel="index" title="Index" href="../genindex.html" /> |
| <link rel="search" title="Search" href="../search.html" /> |
| <link rel="next" title="Building Logical Plans" href="building-logical-plans.html" /> |
| <link rel="prev" title="Working with Exprs" href="working-with-exprs.html" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1"/> |
| <meta name="docsearch:language" content="en"/> |
| <meta name="docsearch:version" content="" /> |
| </head> |
| |
| |
| <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode=""> |
| |
| |
| |
| <div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div> |
| |
| <div id="pst-scroll-pixel-helper"></div> |
| |
| <button type="button" class="btn rounded-pill" id="pst-back-to-top"> |
| <i class="fa-solid fa-arrow-up"></i>Back to top</button> |
| |
| |
| <dialog id="pst-search-dialog"> |
| |
| <form class="bd-search d-flex align-items-center" |
| action="../search.html" |
| method="get"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| <input type="search" |
| class="form-control" |
| name="q" |
| placeholder="Search the docs ..." |
| aria-label="Search the docs ..." |
| autocomplete="off" |
| autocorrect="off" |
| autocapitalize="off" |
| spellcheck="false"/> |
| <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span> |
| </form> |
| </dialog> |
| |
| <div class="pst-async-banner-revealer d-none"> |
| <aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside> |
| </div> |
| |
| |
| <header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none"> |
| <div class="bd-header__inner bd-page-width"> |
| <button class="pst-navbar-icon sidebar-toggle primary-toggle" aria-label="Site navigation"> |
| <span class="fa-solid fa-bars"></span> |
| </button> |
| |
| |
| <div class="col-lg-3 navbar-header-items__start"> |
| |
| <div class="navbar-item"> |
| |
| |
| |
| |
| |
| <a class="navbar-brand logo" href="../index.html"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <img src="../_static/original.svg" class="logo__image only-light" alt="Apache DataFusion documentation - Home"/> |
| <img src="../_static/original_dark.svg" class="logo__image only-dark pst-js-only" alt="Apache DataFusion documentation - Home"/> |
| |
| |
| </a></div> |
| |
| </div> |
| |
| <div class="col-lg-9 navbar-header-items"> |
| |
| |
| <div class="navbar-header-items__end"> |
| |
| <div class="navbar-item navbar-persistent--container"> |
| |
| |
| <button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| <span class="search-button__default-text">Search</span> |
| <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span> |
| </button> |
| </div> |
| |
| |
| <div class="navbar-item"> |
| |
| <button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light" title="Light"></i> |
| <i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark" title="Dark"></i> |
| <i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto" title="System Settings"></i> |
| </button></div> |
| |
| </div> |
| |
| </div> |
| |
| |
| <div class="navbar-persistent--mobile"> |
| |
| <button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| <span class="search-button__default-text">Search</span> |
| <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span> |
| </button> |
| </div> |
| |
| |
| |
| <button class="pst-navbar-icon sidebar-toggle secondary-toggle" aria-label="On this page"> |
| <span class="fa-solid fa-outdent"></span> |
| </button> |
| |
| </div> |
| |
| </header> |
| |
| |
| <div class="bd-container"> |
| <div class="bd-container__inner bd-page-width"> |
| |
| |
| |
| <dialog id="pst-primary-sidebar-modal"></dialog> |
| <div id="pst-primary-sidebar" class="bd-sidebar-primary bd-sidebar"> |
| |
| |
| |
| <div class="sidebar-header-items sidebar-primary__section"> |
| |
| |
| |
| |
| <div class="sidebar-header-items__end"> |
| |
| <div class="navbar-item"> |
| |
| <button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light" title="Light"></i> |
| <i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark" title="Dark"></i> |
| <i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto" title="System Settings"></i> |
| </button></div> |
| |
| </div> |
| |
| </div> |
| |
| <div class="sidebar-primary-items__start sidebar-primary__section"> |
| <div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation"> |
| |
| <div class="bd-toc-item active"> |
| |
| <p aria-level="2" class="caption" role="heading"><span class="caption-text">ASF Links</span></p> |
| <ul class="nav bd-sidenav"> |
| <li class="toctree-l1"><a class="reference external" href="https://apache.org">Apache Software Foundation</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://www.apache.org/licenses/">License</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://www.apache.org/foundation/sponsorship.html">Donate</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://www.apache.org/foundation/thanks.html">Thanks</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://www.apache.org/security/">Security</a></li> |
| </ul> |
| <p aria-level="2" class="caption" role="heading"><span class="caption-text">Links</span></p> |
| <ul class="nav bd-sidenav"> |
| <li class="toctree-l1"><a class="reference external" href="https://github.com/apache/datafusion">GitHub and Issue Tracker</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://crates.io/crates/datafusion">crates.io</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://docs.rs/datafusion/latest/datafusion/">API Docs</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://datafusion.apache.org/blog/">Blog</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://github.com/apache/datafusion/blob/main/CODE_OF_CONDUCT.md">Code of conduct</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../download.html">Download</a></li> |
| </ul> |
| <p aria-level="2" class="caption" role="heading"><span class="caption-text">User Guide</span></p> |
| <ul class="nav bd-sidenav"> |
| <li class="toctree-l1"><a class="reference internal" href="../user-guide/introduction.html">Introduction</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../user-guide/example-usage.html">Example Usage</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../user-guide/features.html">Features</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../user-guide/concepts-readings-events.html">Concepts, Readings, Events</a></li> |
| |
| <li class="toctree-l1"><a class="reference internal" href="../user-guide/crate-configuration.html">Crate Configuration</a></li> |
| <li class="toctree-l1 has-children"><a class="reference internal" href="../user-guide/cli/index.html">DataFusion CLI</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul> |
| <li class="toctree-l2"><a class="reference internal" href="../user-guide/cli/overview.html">Overview</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="../user-guide/cli/installation.html">Installation</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="../user-guide/cli/usage.html">Usage</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="../user-guide/cli/datasources.html">Local Files / Directories</a></li> |
| |
| |
| |
| |
| <li class="toctree-l2"><a class="reference internal" href="../user-guide/cli/functions.html">CLI Specific Functions</a></li> |
| </ul> |
| </details></li> |
| <li class="toctree-l1"><a class="reference internal" href="../user-guide/dataframe.html">DataFrame API</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../user-guide/arrow-introduction.html">Gentle Arrow Introduction</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../user-guide/expressions.html">Expression API</a></li> |
| <li class="toctree-l1 has-children"><a class="reference internal" href="../user-guide/sql/index.html">SQL Reference</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul> |
| <li class="toctree-l2"><a class="reference internal" href="../user-guide/sql/data_types.html">Data Types</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="../user-guide/sql/struct_coercion.html">Struct Type Coercion and Field Mapping</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="../user-guide/sql/select.html">SELECT syntax</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="../user-guide/sql/subqueries.html">Subqueries</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="../user-guide/sql/ddl.html">DDL</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="../user-guide/sql/dml.html">DML</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="../user-guide/sql/explain.html">EXPLAIN</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="../user-guide/sql/information_schema.html">Information Schema</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="../user-guide/sql/operators.html">Operators and Literals</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="../user-guide/sql/aggregate_functions.html">Aggregate Functions</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="../user-guide/sql/window_functions.html">Window Functions</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="../user-guide/sql/scalar_functions.html">Scalar Functions</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="../user-guide/sql/special_functions.html">Special Functions</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="../user-guide/sql/format_options.html">Format Options</a></li> |
| |
| <li class="toctree-l2"><a class="reference internal" href="../user-guide/sql/prepared_statements.html">Prepared Statements</a></li> |
| </ul> |
| </details></li> |
| <li class="toctree-l1"><a class="reference internal" href="../user-guide/configs.html">Configuration Settings</a></li> |
| |
| |
| |
| <li class="toctree-l1"><a class="reference internal" href="../user-guide/explain-usage.html">Reading Explain Plans</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../user-guide/metrics.html">Metrics</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../user-guide/faq.html">Frequently Asked Questions</a></li> |
| |
| </ul> |
| <p aria-level="2" class="caption" role="heading"><span class="caption-text">Library User Guide</span></p> |
| <ul class="current nav bd-sidenav"> |
| <li class="toctree-l1"><a class="reference internal" href="index.html">Introduction</a></li> |
| <li class="toctree-l1 has-children"><a class="reference internal" href="upgrading/index.html">Upgrade Guides</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul> |
| <li class="toctree-l2"><a class="reference internal" href="upgrading/54.0.0.html">DataFusion 54.0.0</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="upgrading/53.0.0.html">DataFusion 53.0.0</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="upgrading/52.0.0.html">DataFusion 52.0.0</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="upgrading/51.0.0.html">DataFusion 51.0.0</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="upgrading/50.0.0.html">DataFusion 50.0.0</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="upgrading/49.0.0.html">DataFusion 49.0.0</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="upgrading/48.0.1.html">DataFusion 48.0.1</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="upgrading/48.0.0.html">DataFusion 48.0.0</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="upgrading/47.0.0.html">DataFusion 47.0.0</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="upgrading/46.0.0.html">DataFusion 46.0.0</a></li> |
| </ul> |
| </details></li> |
| <li class="toctree-l1"><a class="reference internal" href="extensions.html">Extensions List</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="using-the-sql-api.html">Using the SQL API</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="extending-sql.html">Extending SQL Syntax</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="working-with-exprs.html">Working with <code class="docutils literal notranslate"><span class="pre">Expr</span></code>s</a></li> |
| <li class="toctree-l1 current active"><a class="current reference internal" href="#">Using the DataFrame API</a></li> |
| |
| <li class="toctree-l1"><a class="reference internal" href="building-logical-plans.html">Building Logical Plans</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="catalogs.html">Catalogs, Schemas, and Tables</a></li> |
| <li class="toctree-l1 has-children"><a class="reference internal" href="functions/index.html">Functions</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul> |
| <li class="toctree-l2"><a class="reference internal" href="functions/adding-udfs.html">Adding User Defined Functions: Scalar/Window/Aggregate/Table Functions</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="functions/spark.html">Spark Compatible Functions</a></li> |
| </ul> |
| </details></li> |
| <li class="toctree-l1"><a class="reference internal" href="custom-table-providers.html">Custom Table Provider</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="table-constraints.html">Table Constraint Enforcement</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="extending-operators.html">Extending Operators</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="profiling.html">Profiling Cookbook</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="query-optimizer.html">Query Optimizer</a></li> |
| </ul> |
| <p aria-level="2" class="caption" role="heading"><span class="caption-text">Contributor Guide</span></p> |
| <ul class="nav bd-sidenav"> |
| <li class="toctree-l1"><a class="reference internal" href="../contributor-guide/index.html">Introduction</a></li> |
| |
| <li class="toctree-l1"><a class="reference internal" href="../contributor-guide/communication.html">Community Communication</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../contributor-guide/development_environment.html">Development Environment</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../contributor-guide/architecture.html">Architecture</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../contributor-guide/architecture/dependency-graph.html">Workspace Dependency Graph</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../contributor-guide/testing.html">Testing</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../contributor-guide/api-health.html">API health policy</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../contributor-guide/howtos.html">HOWTOs</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../contributor-guide/roadmap.html">Roadmap and Improvement Proposals</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../contributor-guide/governance.html">Governance</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../contributor-guide/inviting.html">Inviting New Committers and PMC Members</a></li> |
| <li class="toctree-l1 has-children"><a class="reference internal" href="../contributor-guide/specification/index.html">Specifications</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul> |
| <li class="toctree-l2"><a class="reference internal" href="../contributor-guide/specification/invariants.html">Invariants</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="../contributor-guide/specification/output-field-name-semantic.html">Output field name semantics</a></li> |
| </ul> |
| </details></li> |
| <li class="toctree-l1 has-children"><a class="reference internal" href="../contributor-guide/gsoc/index.html">Google Summer of Code (GSOC)</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul> |
| <li class="toctree-l2"><a class="reference internal" href="../contributor-guide/gsoc/gsoc_application_guidelines_2025.html">GSoC Application Guidelines (2025)</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="../contributor-guide/gsoc/gsoc_project_ideas_2025.html">GSoC Project Ideas (2025)</a></li> |
| </ul> |
| </details></li> |
| </ul> |
| <p aria-level="2" class="caption" role="heading"><span class="caption-text">DataFusion Subprojects</span></p> |
| <ul class="nav bd-sidenav"> |
| <li class="toctree-l1"><a class="reference external" href="https://datafusion.apache.org/ballista/">DataFusion Ballista</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://datafusion.apache.org/comet/">DataFusion Comet</a></li> |
| <li class="toctree-l1"><a class="reference external" href="https://datafusion.apache.org/python/">DataFusion Python</a></li> |
| </ul> |
| |
| |
| </div> |
| </nav></div> |
| </div> |
| |
| |
| <div class="sidebar-primary-items__end sidebar-primary__section"> |
| <div class="sidebar-primary-item"> |
| <div id="ethical-ad-placement" |
| class="flat" |
| data-ea-publisher="readthedocs" |
| data-ea-type="readthedocs-sidebar" |
| data-ea-manual="true"> |
| </div></div> |
| </div> |
| |
| |
| </div> |
| |
| <main id="main-content" class="bd-main" role="main"> |
| |
| |
| <div class="bd-content"> |
| <div class="bd-article-container"> |
| |
| <div class="bd-header-article d-print-none"> |
| <div class="header-article-items header-article__inner"> |
| |
| <div class="header-article-items__start"> |
| |
| <div class="header-article-item"> |
| |
| <nav aria-label="Breadcrumb" class="d-print-none"> |
| <ul class="bd-breadcrumbs"> |
| |
| <li class="breadcrumb-item breadcrumb-home"> |
| <a href="../index.html" class="nav-link" aria-label="Home"> |
| <i class="fa-solid fa-home"></i> |
| </a> |
| </li> |
| <li class="breadcrumb-item active" aria-current="page"><span class="ellipsis">Using the DataFrame API</span></li> |
| </ul> |
| </nav> |
| </div> |
| |
| </div> |
| |
| |
| </div> |
| </div> |
| |
| |
| |
| |
| <div id="searchbox"></div> |
| <article class="bd-article"> |
| |
| <!--- |
| Licensed to the Apache Software Foundation (ASF) under one |
| or more contributor license agreements. See the NOTICE file |
| distributed with this work for additional information |
| regarding copyright ownership. The ASF licenses this file |
| to you under the Apache License, Version 2.0 (the |
| "License"); you may not use this file except in compliance |
| with the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, |
| software distributed under the License is distributed on an |
| "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| KIND, either express or implied. See the License for the |
| specific language governing permissions and limitations |
| under the License. |
| --> |
| <section id="using-the-dataframe-api"> |
| <h1>Using the DataFrame API<a class="headerlink" href="#using-the-dataframe-api" title="Link to this heading">#</a></h1> |
| <p>The <a class="reference internal" href="../user-guide/dataframe.html"><span class="std std-doc">User Guide</span></a> introduces the <a class="reference external" href="https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html"><code class="docutils literal notranslate"><span class="pre">DataFrame</span></code></a> API, and this section describes |
| that API in more depth.</p> |
| <section id="what-is-a-dataframe"> |
| <h2>What is a DataFrame?<a class="headerlink" href="#what-is-a-dataframe" title="Link to this heading">#</a></h2> |
| <p>As described in the <a class="reference internal" href="../user-guide/dataframe.html"><span class="std std-doc">User Guide</span></a>, DataFusion <a class="reference external" href="https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html"><code class="docutils literal notranslate"><span class="pre">DataFrame</span></code></a>s are modeled after |
| the <a class="reference external" href="https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html">Pandas DataFrame</a> interface, and are implemented as a thin wrapper over a |
| <a class="reference external" href="https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.LogicalPlan.html"><code class="docutils literal notranslate"><span class="pre">LogicalPlan</span></code></a> that adds functionality for building and executing those plans.</p> |
| <p>The simplest possible DataFrame is one that scans a table; that table can be |
| in a file or in memory.</p> |
| </section> |
| <section id="how-to-generate-a-dataframe"> |
| <h2>How to generate a DataFrame<a class="headerlink" href="#how-to-generate-a-dataframe" title="Link to this heading">#</a></h2> |
| <p>You can construct <a class="reference external" href="https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html"><code class="docutils literal notranslate"><span class="pre">DataFrame</span></code></a>s programmatically using the API, similarly to |
| other DataFrame APIs. For example, you can read an in-memory <code class="docutils literal notranslate"><span class="pre">RecordBatch</span></code> into |
| a <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code>:</p> |
| <div class="highlight-rust notranslate"><div class="highlight"><pre><span></span><span class="k">use</span><span class="w"> </span><span class="n">std</span><span class="p">::</span><span class="n">sync</span><span class="p">::</span><span class="n">Arc</span><span class="p">;</span> |
| <span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">prelude</span><span class="p">::</span><span class="o">*</span><span class="p">;</span> |
| <span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">arrow</span><span class="p">::</span><span class="n">array</span><span class="p">::{</span><span class="n">ArrayRef</span><span class="p">,</span><span class="w"> </span><span class="n">Int32Array</span><span class="p">};</span> |
| <span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">arrow</span><span class="p">::</span><span class="n">record_batch</span><span class="p">::</span><span class="n">RecordBatch</span><span class="p">;</span> |
| <span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">error</span><span class="p">::</span><span class="nb">Result</span><span class="p">;</span> |
| |
| <span class="cp">#[tokio::main]</span> |
| <span class="k">async</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">main</span><span class="p">()</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nb">Result</span><span class="o"><</span><span class="p">()</span><span class="o">></span><span class="w"> </span><span class="p">{</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">ctx</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">SessionContext</span><span class="p">::</span><span class="n">new</span><span class="p">();</span> |
| <span class="w"> </span><span class="c1">// Create an in-memory RecordBatch containing the following data</span> |
| <span class="w"> </span><span class="c1">// id | bank_account</span> |
| <span class="w"> </span><span class="c1">// ---|-------------</span> |
| <span class="w"> </span><span class="c1">// 1 | 9000</span> |
| <span class="w"> </span><span class="c1">// 2 | 8000</span> |
| <span class="w"> </span><span class="c1">// 3 | 7000</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">data</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">RecordBatch</span><span class="p">::</span><span class="n">try_from_iter</span><span class="p">(</span><span class="fm">vec!</span><span class="p">[</span> |
| <span class="w"> </span><span class="p">(</span><span class="s">"id"</span><span class="p">,</span><span class="w"> </span><span class="n">Arc</span><span class="p">::</span><span class="n">new</span><span class="p">(</span><span class="n">Int32Array</span><span class="p">::</span><span class="n">from</span><span class="p">(</span><span class="fm">vec!</span><span class="p">[</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">]))</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="n">ArrayRef</span><span class="p">),</span> |
| <span class="w"> </span><span class="p">(</span><span class="s">"bank_account"</span><span class="p">,</span><span class="w"> </span><span class="n">Arc</span><span class="p">::</span><span class="n">new</span><span class="p">(</span><span class="n">Int32Array</span><span class="p">::</span><span class="n">from</span><span class="p">(</span><span class="fm">vec!</span><span class="p">[</span><span class="mi">9000</span><span class="p">,</span><span class="w"> </span><span class="mi">8000</span><span class="p">,</span><span class="w"> </span><span class="mi">7000</span><span class="p">]))),</span> |
| <span class="w"> </span><span class="p">])</span><span class="o">?</span><span class="p">;</span> |
| <span class="w"> </span><span class="c1">// Create a DataFrame that scans the user table, and finds</span> |
| <span class="w"> </span><span class="c1">// all users with a bank account balance of at least 8000</span> |
| <span class="w"> </span><span class="c1">// and sorts the results by bank account in descending order</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">dataframe</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ctx</span> |
| <span class="w"> </span><span class="p">.</span><span class="n">read_batch</span><span class="p">(</span><span class="n">data</span><span class="p">)</span><span class="o">?</span> |
| <span class="w"> </span><span class="p">.</span><span class="n">filter</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s">"bank_account"</span><span class="p">).</span><span class="n">gt_eq</span><span class="p">(</span><span class="n">lit</span><span class="p">(</span><span class="mi">8000</span><span class="p">)))</span><span class="o">?</span><span class="w"> </span><span class="c1">// bank_account >= 8000</span> |
| <span class="w"> </span><span class="p">.</span><span class="n">sort</span><span class="p">(</span><span class="fm">vec!</span><span class="p">[</span><span class="n">col</span><span class="p">(</span><span class="s">"bank_account"</span><span class="p">).</span><span class="n">sort</span><span class="p">(</span><span class="kc">false</span><span class="p">,</span><span class="w"> </span><span class="kc">true</span><span class="p">)])</span><span class="o">?</span><span class="p">;</span><span class="w"> </span><span class="c1">// ORDER BY bank_account DESC</span> |
| |
| <span class="w"> </span><span class="nb">Ok</span><span class="p">(())</span> |
| <span class="p">}</span> |
| </pre></div> |
| </div> |
| <p>You can <em>also</em> generate a <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code> from a SQL query and use the DataFrame’s APIs |
| to manipulate the output of the query.</p> |
| <div class="highlight-rust notranslate"><div class="highlight"><pre><span></span><span class="k">use</span><span class="w"> </span><span class="n">std</span><span class="p">::</span><span class="n">sync</span><span class="p">::</span><span class="n">Arc</span><span class="p">;</span> |
| <span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">prelude</span><span class="p">::</span><span class="o">*</span><span class="p">;</span> |
| <span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">assert_batches_eq</span><span class="p">;</span> |
| <span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">arrow</span><span class="p">::</span><span class="n">array</span><span class="p">::{</span><span class="n">ArrayRef</span><span class="p">,</span><span class="w"> </span><span class="n">Int32Array</span><span class="p">};</span> |
| <span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">arrow</span><span class="p">::</span><span class="n">record_batch</span><span class="p">::</span><span class="n">RecordBatch</span><span class="p">;</span> |
| <span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">error</span><span class="p">::</span><span class="nb">Result</span><span class="p">;</span> |
| |
| <span class="cp">#[tokio::main]</span> |
| <span class="k">async</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">main</span><span class="p">()</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nb">Result</span><span class="o"><</span><span class="p">()</span><span class="o">></span><span class="w"> </span><span class="p">{</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">ctx</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">SessionContext</span><span class="p">::</span><span class="n">new</span><span class="p">();</span> |
| <span class="w"> </span><span class="c1">// Register the same in-memory table as the previous example</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">data</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">RecordBatch</span><span class="p">::</span><span class="n">try_from_iter</span><span class="p">(</span><span class="fm">vec!</span><span class="p">[</span> |
| <span class="w"> </span><span class="p">(</span><span class="s">"id"</span><span class="p">,</span><span class="w"> </span><span class="n">Arc</span><span class="p">::</span><span class="n">new</span><span class="p">(</span><span class="n">Int32Array</span><span class="p">::</span><span class="n">from</span><span class="p">(</span><span class="fm">vec!</span><span class="p">[</span><span class="mi">1</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">]))</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="n">ArrayRef</span><span class="p">),</span> |
| <span class="w"> </span><span class="p">(</span><span class="s">"bank_account"</span><span class="p">,</span><span class="w"> </span><span class="n">Arc</span><span class="p">::</span><span class="n">new</span><span class="p">(</span><span class="n">Int32Array</span><span class="p">::</span><span class="n">from</span><span class="p">(</span><span class="fm">vec!</span><span class="p">[</span><span class="mi">9000</span><span class="p">,</span><span class="w"> </span><span class="mi">8000</span><span class="p">,</span><span class="w"> </span><span class="mi">7000</span><span class="p">]))),</span> |
| <span class="w"> </span><span class="p">])</span><span class="o">?</span><span class="p">;</span> |
| <span class="w"> </span><span class="n">ctx</span><span class="p">.</span><span class="n">register_batch</span><span class="p">(</span><span class="s">"users"</span><span class="p">,</span><span class="w"> </span><span class="n">data</span><span class="p">)</span><span class="o">?</span><span class="p">;</span> |
| <span class="w"> </span><span class="c1">// Create a DataFrame using SQL</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">dataframe</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ctx</span><span class="p">.</span><span class="n">sql</span><span class="p">(</span><span class="s">"SELECT * FROM users;"</span><span class="p">)</span> |
| <span class="w"> </span><span class="p">.</span><span class="k">await</span><span class="o">?</span> |
| <span class="w"> </span><span class="c1">// Note we can filter the output of the query using the DataFrame API</span> |
| <span class="w"> </span><span class="p">.</span><span class="n">filter</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s">"bank_account"</span><span class="p">).</span><span class="n">gt_eq</span><span class="p">(</span><span class="n">lit</span><span class="p">(</span><span class="mi">8000</span><span class="p">)))</span><span class="o">?</span><span class="p">;</span><span class="w"> </span><span class="c1">// bank_account >= 8000</span> |
| |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">results</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="o">&</span><span class="n">dataframe</span><span class="p">.</span><span class="n">collect</span><span class="p">().</span><span class="k">await</span><span class="o">?</span><span class="p">;</span> |
| |
| <span class="w"> </span><span class="c1">// use the `assert_batches_eq` macro to show the output</span> |
| <span class="w"> </span><span class="n">assert_batches_eq</span><span class="o">!</span><span class="p">(</span> |
| <span class="w"> </span><span class="fm">vec!</span><span class="p">[</span> |
| <span class="w"> </span><span class="s">"+----+--------------+"</span><span class="p">,</span> |
| <span class="w"> </span><span class="s">"| id | bank_account |"</span><span class="p">,</span> |
| <span class="w"> </span><span class="s">"+----+--------------+"</span><span class="p">,</span> |
| <span class="w"> </span><span class="s">"| 1 | 9000 |"</span><span class="p">,</span> |
| <span class="w"> </span><span class="s">"| 2 | 8000 |"</span><span class="p">,</span> |
| <span class="w"> </span><span class="s">"+----+--------------+"</span><span class="p">,</span> |
| <span class="w"> </span><span class="p">],</span> |
| <span class="w"> </span><span class="o">&</span><span class="n">results</span> |
| <span class="w"> </span><span class="p">);</span> |
| <span class="w"> </span><span class="nb">Ok</span><span class="p">(())</span> |
| <span class="p">}</span> |
| </pre></div> |
| </div> |
| </section> |
| <section id="collect-streaming-exec"> |
| <h2>Collect / Streaming Exec<a class="headerlink" href="#collect-streaming-exec" title="Link to this heading">#</a></h2> |
| <p>DataFusion <a class="reference external" href="https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html"><code class="docutils literal notranslate"><span class="pre">DataFrame</span></code></a>s are “lazy”, meaning they do no processing until |
| they are executed, which allows for additional optimizations.</p> |
| <p>You can run a <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code> in one of three ways:</p> |
| <ol class="arabic simple"> |
| <li><p><code class="docutils literal notranslate"><span class="pre">collect</span></code>: executes the query and buffers all the output into a <code class="docutils literal notranslate"><span class="pre">Vec<RecordBatch></span></code></p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">execute_stream</span></code>: begins execution and returns a <code class="docutils literal notranslate"><span class="pre">SendableRecordBatchStream</span></code> which incrementally computes output on each call to <code class="docutils literal notranslate"><span class="pre">next()</span></code></p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">cache</span></code>: executes the query and buffers the output into a new in-memory <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code>.</p></li> |
| </ol> |
| <p>To collect all outputs into a memory buffer, use the <code class="docutils literal notranslate"><span class="pre">collect</span></code> method:</p> |
| <div class="highlight-rust notranslate"><div class="highlight"><pre><span></span><span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">prelude</span><span class="p">::</span><span class="o">*</span><span class="p">;</span> |
| <span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">error</span><span class="p">::</span><span class="nb">Result</span><span class="p">;</span> |
| |
| <span class="cp">#[tokio::main]</span> |
| <span class="k">async</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">main</span><span class="p">()</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nb">Result</span><span class="o"><</span><span class="p">()</span><span class="o">></span><span class="w"> </span><span class="p">{</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">ctx</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">SessionContext</span><span class="p">::</span><span class="n">new</span><span class="p">();</span> |
| <span class="w"> </span><span class="c1">// read the contents of a CSV file into a DataFrame</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">df</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ctx</span><span class="p">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s">"tests/data/example.csv"</span><span class="p">,</span><span class="w"> </span><span class="n">CsvReadOptions</span><span class="p">::</span><span class="n">new</span><span class="p">()).</span><span class="k">await</span><span class="o">?</span><span class="p">;</span> |
| <span class="w"> </span><span class="c1">// execute the query and collect the results as a Vec<RecordBatch></span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">batches</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">df</span><span class="p">.</span><span class="n">collect</span><span class="p">().</span><span class="k">await</span><span class="o">?</span><span class="p">;</span> |
| <span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="n">record_batch</span><span class="w"> </span><span class="k">in</span><span class="w"> </span><span class="n">batches</span><span class="w"> </span><span class="p">{</span> |
| <span class="w"> </span><span class="fm">println!</span><span class="p">(</span><span class="s">"{record_batch:?}"</span><span class="p">);</span> |
| <span class="w"> </span><span class="p">}</span> |
| <span class="w"> </span><span class="nb">Ok</span><span class="p">(())</span> |
| <span class="p">}</span> |
| </pre></div> |
| </div> |
| <p>Use <code class="docutils literal notranslate"><span class="pre">execute_stream</span></code> to incrementally generate output one <code class="docutils literal notranslate"><span class="pre">RecordBatch</span></code> at a time:</p> |
| <div class="highlight-rust notranslate"><div class="highlight"><pre><span></span><span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">prelude</span><span class="p">::</span><span class="o">*</span><span class="p">;</span> |
| <span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">error</span><span class="p">::</span><span class="nb">Result</span><span class="p">;</span> |
| <span class="k">use</span><span class="w"> </span><span class="n">futures</span><span class="p">::</span><span class="n">stream</span><span class="p">::</span><span class="n">StreamExt</span><span class="p">;</span> |
| |
| <span class="cp">#[tokio::main]</span> |
| <span class="k">async</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">main</span><span class="p">()</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nb">Result</span><span class="o"><</span><span class="p">()</span><span class="o">></span><span class="w"> </span><span class="p">{</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">ctx</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">SessionContext</span><span class="p">::</span><span class="n">new</span><span class="p">();</span> |
| <span class="w"> </span><span class="c1">// read example.csv file into a DataFrame</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">df</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ctx</span><span class="p">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s">"tests/data/example.csv"</span><span class="p">,</span><span class="w"> </span><span class="n">CsvReadOptions</span><span class="p">::</span><span class="n">new</span><span class="p">()).</span><span class="k">await</span><span class="o">?</span><span class="p">;</span> |
| <span class="w"> </span><span class="c1">// begin execution (returns quickly, does not compute results)</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="k">mut</span><span class="w"> </span><span class="n">stream</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">df</span><span class="p">.</span><span class="n">execute_stream</span><span class="p">().</span><span class="k">await</span><span class="o">?</span><span class="p">;</span> |
| <span class="w"> </span><span class="c1">// results are returned incrementally as they are computed</span> |
| <span class="w"> </span><span class="k">while</span><span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="nb">Some</span><span class="p">(</span><span class="n">record_batch</span><span class="p">)</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">stream</span><span class="p">.</span><span class="n">next</span><span class="p">().</span><span class="k">await</span><span class="w"> </span><span class="p">{</span> |
| <span class="w"> </span><span class="fm">println!</span><span class="p">(</span><span class="s">"{record_batch:?}"</span><span class="p">);</span> |
| <span class="w"> </span><span class="p">}</span> |
| <span class="w"> </span><span class="nb">Ok</span><span class="p">(())</span> |
| <span class="p">}</span> |
| </pre></div> |
| </div> |
| </section> |
| </section> |
| <section id="write-dataframe-to-files"> |
| <h1>Write DataFrame to Files<a class="headerlink" href="#write-dataframe-to-files" title="Link to this heading">#</a></h1> |
| <p>You can also write the contents of a <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code> to a file. When writing a file, |
| DataFusion executes the <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code> and streams the results to the output. |
| DataFusion comes with support for writing <code class="docutils literal notranslate"><span class="pre">csv</span></code>, <code class="docutils literal notranslate"><span class="pre">json</span></code>, <code class="docutils literal notranslate"><span class="pre">arrow</span></code>, <code class="docutils literal notranslate"><span class="pre">avro</span></code>, and |
| <code class="docutils literal notranslate"><span class="pre">parquet</span></code> files, and supports writing custom file formats via API (see |
| <a class="reference external" href="https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/custom_data_source/custom_file_format.rs"><code class="docutils literal notranslate"><span class="pre">custom_file_format.rs</span></code></a> for an example).</p> |
| <p>For example, to read a CSV file and write it to a parquet file, use the |
| <a class="reference external" href="https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html#method.write_parquet"><code class="docutils literal notranslate"><span class="pre">DataFrame::write_parquet</span></code></a> method:</p> |
| <div class="highlight-rust notranslate"><div class="highlight"><pre><span></span><span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">prelude</span><span class="p">::</span><span class="o">*</span><span class="p">;</span> |
| <span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">error</span><span class="p">::</span><span class="nb">Result</span><span class="p">;</span> |
| <span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">dataframe</span><span class="p">::</span><span class="n">DataFrameWriteOptions</span><span class="p">;</span> |
| |
| <span class="cp">#[tokio::main]</span> |
| <span class="k">async</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">main</span><span class="p">()</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nb">Result</span><span class="o"><</span><span class="p">()</span><span class="o">></span><span class="w"> </span><span class="p">{</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">ctx</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">SessionContext</span><span class="p">::</span><span class="n">new</span><span class="p">();</span> |
| <span class="w"> </span><span class="c1">// read example.csv file into a DataFrame</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">df</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ctx</span><span class="p">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s">"tests/data/example.csv"</span><span class="p">,</span><span class="w"> </span><span class="n">CsvReadOptions</span><span class="p">::</span><span class="n">new</span><span class="p">()).</span><span class="k">await</span><span class="o">?</span><span class="p">;</span> |
| <span class="w"> </span><span class="c1">// stream the contents of the DataFrame to the `example.parquet` file</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">target_path</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">tempfile</span><span class="p">::</span><span class="n">tempdir</span><span class="p">()</span><span class="o">?</span><span class="p">.</span><span class="n">path</span><span class="p">().</span><span class="n">join</span><span class="p">(</span><span class="s">"example.parquet"</span><span class="p">);</span> |
| <span class="w"> </span><span class="n">df</span><span class="p">.</span><span class="n">write_parquet</span><span class="p">(</span> |
| <span class="w"> </span><span class="n">target_path</span><span class="p">.</span><span class="n">to_str</span><span class="p">().</span><span class="n">unwrap</span><span class="p">(),</span> |
| <span class="w"> </span><span class="n">DataFrameWriteOptions</span><span class="p">::</span><span class="n">new</span><span class="p">(),</span> |
| <span class="w"> </span><span class="nb">None</span><span class="p">,</span><span class="w"> </span><span class="c1">// writer_options</span> |
| <span class="w"> </span><span class="p">).</span><span class="k">await</span><span class="p">;</span> |
| <span class="w"> </span><span class="nb">Ok</span><span class="p">(())</span> |
| <span class="p">}</span> |
| </pre></div> |
| </div> |
| <p>The output file will look like (Example Output):</p> |
| <div class="highlight-sql notranslate"><div class="highlight"><pre><span></span><span class="o">></span><span class="w"> </span><span class="k">select</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="k">from</span><span class="w"> </span><span class="s1">'../datafusion/core/example.parquet'</span><span class="p">;</span> |
| <span class="o">+</span><span class="c1">---+---+---+</span> |
| <span class="o">|</span><span class="w"> </span><span class="n">a</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="n">b</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="k">c</span><span class="w"> </span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">---+---+---+</span> |
| <span class="o">|</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="mi">3</span><span class="w"> </span><span class="o">|</span> |
| <span class="o">+</span><span class="c1">---+---+---+</span> |
| </pre></div> |
| </div> |
| <section id="relationship-between-logicalplans-and-dataframes"> |
| <h2>Relationship between <code class="docutils literal notranslate"><span class="pre">LogicalPlan</span></code>s and <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code>s<a class="headerlink" href="#relationship-between-logicalplans-and-dataframes" title="Link to this heading">#</a></h2> |
| <p>The <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code> struct is defined like this:</p> |
| <div class="highlight-rust notranslate"><div class="highlight"><pre><span></span><span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">execution</span><span class="p">::</span><span class="n">session_state</span><span class="p">::</span><span class="n">SessionState</span><span class="p">;</span> |
| <span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">logical_expr</span><span class="p">::</span><span class="n">LogicalPlan</span><span class="p">;</span> |
| <span class="k">pub</span><span class="w"> </span><span class="k">struct</span><span class="w"> </span><span class="nc">DataFrame</span><span class="w"> </span><span class="p">{</span> |
| <span class="w"> </span><span class="c1">// state required to execute a LogicalPlan</span> |
| <span class="w"> </span><span class="n">session_state</span><span class="p">:</span><span class="w"> </span><span class="nb">Box</span><span class="o"><</span><span class="n">SessionState</span><span class="o">></span><span class="p">,</span> |
| <span class="w"> </span><span class="c1">// LogicalPlan that describes the computation to perform</span> |
| <span class="w"> </span><span class="n">plan</span><span class="p">:</span><span class="w"> </span><span class="nc">LogicalPlan</span><span class="p">,</span> |
| <span class="p">}</span> |
| </pre></div> |
| </div> |
| <p>As shown above, <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code> is a thin wrapper of <code class="docutils literal notranslate"><span class="pre">LogicalPlan</span></code>, so you can |
| easily go back and forth between them.</p> |
| <div class="highlight-rust notranslate"><div class="highlight"><pre><span></span><span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">prelude</span><span class="p">::</span><span class="o">*</span><span class="p">;</span> |
| <span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">error</span><span class="p">::</span><span class="nb">Result</span><span class="p">;</span> |
| <span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">logical_expr</span><span class="p">::</span><span class="n">LogicalPlanBuilder</span><span class="p">;</span> |
| |
| <span class="cp">#[tokio::main]</span> |
| <span class="k">async</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">main</span><span class="p">()</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nb">Result</span><span class="o"><</span><span class="p">()</span><span class="o">></span><span class="p">{</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">ctx</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">SessionContext</span><span class="p">::</span><span class="n">new</span><span class="p">();</span> |
| <span class="w"> </span><span class="c1">// read example.csv file into a DataFrame</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">df</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ctx</span><span class="p">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s">"tests/data/example.csv"</span><span class="p">,</span><span class="w"> </span><span class="n">CsvReadOptions</span><span class="p">::</span><span class="n">new</span><span class="p">()).</span><span class="k">await</span><span class="o">?</span><span class="p">;</span> |
| <span class="w"> </span><span class="c1">// You can easily get the LogicalPlan from the DataFrame</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="p">(</span><span class="n">_state</span><span class="p">,</span><span class="w"> </span><span class="n">plan</span><span class="p">)</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">df</span><span class="p">.</span><span class="n">into_parts</span><span class="p">();</span> |
| <span class="w"> </span><span class="c1">// Combine the LogicalPlan with the SessionContext's state</span> |
| <span class="w"> </span><span class="c1">// to get a DataFrame back</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">new_df</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">DataFrame</span><span class="p">::</span><span class="n">new</span><span class="p">(</span><span class="n">ctx</span><span class="p">.</span><span class="n">state</span><span class="p">(),</span><span class="w"> </span><span class="n">plan</span><span class="p">);</span> |
| <span class="w"> </span><span class="nb">Ok</span><span class="p">(())</span> |
| <span class="p">}</span> |
| </pre></div> |
| </div> |
| <p>In fact, using the <a class="reference external" href="https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html"><code class="docutils literal notranslate"><span class="pre">DataFrame</span></code></a>’s methods you can create the same |
| <a class="reference external" href="https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.LogicalPlan.html"><code class="docutils literal notranslate"><span class="pre">LogicalPlan</span></code></a>s as when using <a class="reference external" href="https://docs.rs/datafusion/latest/datafusion/logical_expr/struct.LogicalPlanBuilder.html"><code class="docutils literal notranslate"><span class="pre">LogicalPlanBuilder</span></code></a>:</p> |
| <div class="highlight-rust notranslate"><div class="highlight"><pre><span></span><span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">prelude</span><span class="p">::</span><span class="o">*</span><span class="p">;</span> |
| <span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">error</span><span class="p">::</span><span class="nb">Result</span><span class="p">;</span> |
| <span class="k">use</span><span class="w"> </span><span class="n">datafusion</span><span class="p">::</span><span class="n">logical_expr</span><span class="p">::</span><span class="n">LogicalPlanBuilder</span><span class="p">;</span> |
| |
| <span class="cp">#[tokio::main]</span> |
| <span class="k">async</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">main</span><span class="p">()</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nb">Result</span><span class="o"><</span><span class="p">()</span><span class="o">></span><span class="p">{</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">ctx</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">SessionContext</span><span class="p">::</span><span class="n">new</span><span class="p">();</span> |
| <span class="w"> </span><span class="c1">// read example.csv file into a DataFrame</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">df</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ctx</span><span class="p">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s">"tests/data/example.csv"</span><span class="p">,</span><span class="w"> </span><span class="n">CsvReadOptions</span><span class="p">::</span><span class="n">new</span><span class="p">()).</span><span class="k">await</span><span class="o">?</span><span class="p">;</span> |
| <span class="w"> </span><span class="c1">// Create a new DataFrame that selects `a` and `b`, sorted by `a`</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">new_df</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">df</span><span class="p">.</span><span class="n">select</span><span class="p">(</span><span class="fm">vec!</span><span class="p">[</span><span class="n">col</span><span class="p">(</span><span class="s">"a"</span><span class="p">),</span><span class="w"> </span><span class="n">col</span><span class="p">(</span><span class="s">"b"</span><span class="p">)])</span><span class="o">?</span> |
| <span class="w"> </span><span class="p">.</span><span class="n">sort_by</span><span class="p">(</span><span class="fm">vec!</span><span class="p">[</span><span class="n">col</span><span class="p">(</span><span class="s">"a"</span><span class="p">)])</span><span class="o">?</span><span class="p">;</span> |
| <span class="w"> </span><span class="c1">// Build the same plan using the LogicalPlanBuilder</span> |
| <span class="w"> </span><span class="c1">// Similar to `SELECT a, b FROM example.csv ORDER BY a`</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">df</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ctx</span><span class="p">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s">"tests/data/example.csv"</span><span class="p">,</span><span class="w"> </span><span class="n">CsvReadOptions</span><span class="p">::</span><span class="n">new</span><span class="p">()).</span><span class="k">await</span><span class="o">?</span><span class="p">;</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="p">(</span><span class="n">_state</span><span class="p">,</span><span class="w"> </span><span class="n">plan</span><span class="p">)</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">df</span><span class="p">.</span><span class="n">into_parts</span><span class="p">();</span><span class="w"> </span><span class="c1">// get the DataFrame's LogicalPlan</span> |
| <span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">plan</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">LogicalPlanBuilder</span><span class="p">::</span><span class="n">from</span><span class="p">(</span><span class="n">plan</span><span class="p">)</span> |
| <span class="w"> </span><span class="p">.</span><span class="n">project</span><span class="p">(</span><span class="fm">vec!</span><span class="p">[</span><span class="n">col</span><span class="p">(</span><span class="s">"a"</span><span class="p">),</span><span class="w"> </span><span class="n">col</span><span class="p">(</span><span class="s">"b"</span><span class="p">)])</span><span class="o">?</span> |
| <span class="w"> </span><span class="p">.</span><span class="n">sort_by</span><span class="p">(</span><span class="fm">vec!</span><span class="p">[</span><span class="n">col</span><span class="p">(</span><span class="s">"a"</span><span class="p">)])</span><span class="o">?</span> |
| <span class="w"> </span><span class="p">.</span><span class="n">build</span><span class="p">()</span><span class="o">?</span><span class="p">;</span> |
| <span class="w"> </span><span class="c1">// prove they are the same</span> |
| <span class="w"> </span><span class="fm">assert_eq!</span><span class="p">(</span><span class="n">new_df</span><span class="p">.</span><span class="n">logical_plan</span><span class="p">(),</span><span class="w"> </span><span class="o">&</span><span class="n">plan</span><span class="p">);</span> |
| <span class="w"> </span><span class="nb">Ok</span><span class="p">(())</span> |
| <span class="p">}</span> |
| </pre></div> |
| </div> |
| </section> |
| </section> |
| |
| |
| </article> |
| |
| |
| |
| |
| |
| <footer class="prev-next-footer d-print-none"> |
| |
| <div class="prev-next-area"> |
| <a class="left-prev" |
| href="working-with-exprs.html" |
| title="previous page"> |
| <i class="fa-solid fa-angle-left"></i> |
| <div class="prev-next-info"> |
| <p class="prev-next-subtitle">previous</p> |
| <p class="prev-next-title">Working with <code class="docutils literal notranslate"><span class="pre">Expr</span></code>s</p> |
| </div> |
| </a> |
| <a class="right-next" |
| href="building-logical-plans.html" |
| title="next page"> |
| <div class="prev-next-info"> |
| <p class="prev-next-subtitle">next</p> |
| <p class="prev-next-title">Building Logical Plans</p> |
| </div> |
| <i class="fa-solid fa-angle-right"></i> |
| </a> |
| </div> |
| </footer> |
| |
| </div> |
| |
| |
| |
| <dialog id="pst-secondary-sidebar-modal"></dialog> |
| <div id="pst-secondary-sidebar" class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner"> |
| |
| |
| <div class="sidebar-secondary-item"> |
| <div |
| id="pst-page-navigation-heading-2" |
| class="page-toc tocsection onthispage"> |
| <i class="fa-solid fa-list"></i> On this page |
| </div> |
| <nav class="bd-toc-nav page-toc" aria-labelledby="pst-page-navigation-heading-2"> |
| <ul class="visible nav section-nav flex-column"> |
| <li class="toc-h1 nav-item toc-entry"><a class="reference internal nav-link" href="#">Using the DataFrame API</a><ul class="visible nav section-nav flex-column"> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#what-is-a-dataframe">What is a DataFrame?</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#how-to-generate-a-dataframe">How to generate a DataFrame</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#collect-streaming-exec">Collect / Streaming Exec</a></li> |
| </ul> |
| </li> |
| <li class="toc-h1 nav-item toc-entry"><a class="reference internal nav-link" href="#write-dataframe-to-files">Write DataFrame to Files</a><ul class="visible nav section-nav flex-column"> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#relationship-between-logicalplans-and-dataframes">Relationship between <code class="docutils literal notranslate"><span class="pre">LogicalPlan</span></code>s and <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code>s</a></li> |
| </ul> |
| </li> |
| </ul> |
| |
| </nav></div> |
| |
| <div class="sidebar-secondary-item"> |
| |
| |
| <div class="tocsection editthispage"> |
| <a href="https://github.com/apache/datafusion/edit/main/docs/source/library-user-guide/using-the-dataframe-api.md"> |
| <i class="fa-solid fa-pencil"></i> |
| |
| |
| |
| Edit on GitHub |
| |
| |
| </a> |
| </div> |
| </div> |
| |
| <div class="sidebar-secondary-item"> |
| <div role="note" aria-label="source link"> |
| <h3>This Page</h3> |
| <ul class="this-page-menu"> |
| <li><a href="../_sources/library-user-guide/using-the-dataframe-api.md.txt" |
| rel="nofollow">Show Source</a></li> |
| </ul> |
| </div></div> |
| |
| </div></div> |
| |
| |
| </div> |
| <footer class="bd-footer-content"> |
| |
| </footer> |
| |
| </main> |
| </div> |
| </div> |
| |
| <!-- Scripts loaded after <body> so the DOM is not blocked --> |
| <script defer src="../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf"></script> |
| <script defer src="../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf"></script> |
| |
| <!-- Based on pydata_sphinx_theme/footer.html --> |
| <footer class="footer mt-5 mt-md-0"> |
| <div class="container"> |
| |
| <div class="footer-item"> |
| <p>Apache DataFusion, Apache, the Apache feather logo, and the Apache DataFusion project logo</p> |
| <p>are either registered trademarks or trademarks of The Apache Software Foundation in the United States and other countries.</p> |
| </div> |
| </div> |
| </footer> |
| |
| |
| </body> |
| </html> |