blob: 6c06b669df1dc4f7c3a55ca851cf64d80c7b34e2 [file] [log] [blame]
<!DOCTYPE html>
<html lang="en" data-content_root="../" >
<head>
<meta charset="utf-8" />
<!-- Responsive viewport: layout width follows the device width at an initial zoom of 1. -->
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Introduction &#8212; Apache DataFusion documentation</title>
<!-- Copy the "mode" and "theme" values stored in localStorage onto the root
     element's data-mode / data-theme attributes; each falls back to an empty
     string when nothing is stored. -->
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
</script>
<!--
This gives us a CSS class that will be invisible only if JS is disabled.
-->
<noscript>
<style>
.pst-js-only { display: none !important; }
</style>
</noscript>
<!-- Loaded before other Sphinx assets -->
<link href="../_static/styles/theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=8f2a1f02" />
<link rel="stylesheet" type="text/css" href="../_static/theme_overrides.css?v=d08b24aa" />
<!-- So that users can add custom icons -->
<script src="../_static/scripts/fontawesome.js?digest=8878045cc6db502f8baf"></script>
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf" />
<script src="../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../_static/doctools.js?v=9bcbadda"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script>DOCUMENTATION_OPTIONS.pagename = 'user-guide/introduction';</script>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="Example Usage" href="example-usage.html" />
<link rel="prev" title="Download" href="../download.html" />
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="" />
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
<div id="pst-scroll-pixel-helper"></div>
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
<i class="fa-solid fa-arrow-up"></i>Back to top</button>
<dialog id="pst-search-dialog">
<form class="bd-search d-flex align-items-center"
action="../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
placeholder="Search the docs ..."
aria-label="Search the docs ..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form>
</dialog>
<div class="pst-async-banner-revealer d-none">
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
</div>
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
<div class="bd-header__inner bd-page-width">
<button class="pst-navbar-icon sidebar-toggle primary-toggle" aria-label="Site navigation">
<span class="fa-solid fa-bars"></span>
</button>
<div class="col-lg-3 navbar-header-items__start">
<div class="navbar-item">
<a class="navbar-brand logo" href="../index.html">
<img src="../_static/original.svg" class="logo__image only-light" alt="Apache DataFusion documentation - Home"/>
<img src="../_static/original_dark.svg" class="logo__image only-dark pst-js-only" alt="Apache DataFusion documentation - Home"/>
</a></div>
</div>
<div class="col-lg-9 navbar-header-items">
<div class="navbar-header-items__end">
<div class="navbar-item navbar-persistent--container">
<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
</div>
<div class="navbar-item">
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light" title="Light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark" title="Dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto" title="System Settings"></i>
</button></div>
</div>
</div>
<div class="navbar-persistent--mobile">
<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
</div>
<button class="pst-navbar-icon sidebar-toggle secondary-toggle" aria-label="On this page">
<span class="fa-solid fa-outdent"></span>
</button>
</div>
</header>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<dialog id="pst-primary-sidebar-modal"></dialog>
<div id="pst-primary-sidebar" class="bd-sidebar-primary bd-sidebar">
<div class="sidebar-header-items sidebar-primary__section">
<div class="sidebar-header-items__end">
<div class="navbar-item">
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light" title="Light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark" title="Dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto" title="System Settings"></i>
</button></div>
</div>
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
<div class="bd-toc-item active">
<p aria-level="2" class="caption" role="heading"><span class="caption-text">ASF Links</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference external" href="https://apache.org">Apache Software Foundation</a></li>
<li class="toctree-l1"><a class="reference external" href="https://www.apache.org/licenses/">License</a></li>
<li class="toctree-l1"><a class="reference external" href="https://www.apache.org/foundation/sponsorship.html">Donate</a></li>
<li class="toctree-l1"><a class="reference external" href="https://www.apache.org/foundation/thanks.html">Thanks</a></li>
<li class="toctree-l1"><a class="reference external" href="https://www.apache.org/security/">Security</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Links</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/datafusion">GitHub and Issue Tracker</a></li>
<li class="toctree-l1"><a class="reference external" href="https://crates.io/crates/datafusion">crates.io</a></li>
<li class="toctree-l1"><a class="reference external" href="https://docs.rs/datafusion/latest/datafusion/">API Docs</a></li>
<li class="toctree-l1"><a class="reference external" href="https://datafusion.apache.org/blog/">Blog</a></li>
<li class="toctree-l1"><a class="reference external" href="https://github.com/apache/datafusion/blob/main/CODE_OF_CONDUCT.md">Code of conduct</a></li>
<li class="toctree-l1"><a class="reference internal" href="../download.html">Download</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">User Guide</span></p>
<ul class="current nav bd-sidenav">
<li class="toctree-l1 current active"><a class="current reference internal" href="#">Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="example-usage.html">Example Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="features.html">Features</a></li>
<li class="toctree-l1"><a class="reference internal" href="concepts-readings-events.html">Concepts, Readings, Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="crate-configuration.html">Crate Configuration</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="cli/index.html">DataFusion CLI</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="cli/overview.html">Overview</a></li>
<li class="toctree-l2"><a class="reference internal" href="cli/installation.html">Installation</a></li>
<li class="toctree-l2"><a class="reference internal" href="cli/usage.html">Usage</a></li>
<li class="toctree-l2"><a class="reference internal" href="cli/datasources.html">Local Files / Directories</a></li>
<li class="toctree-l2"><a class="reference internal" href="cli/functions.html">CLI Specific Functions</a></li>
</ul>
</details></li>
<li class="toctree-l1"><a class="reference internal" href="dataframe.html">DataFrame API</a></li>
<li class="toctree-l1"><a class="reference internal" href="arrow-introduction.html">Gentle Arrow Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="expressions.html">Expression API</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="sql/index.html">SQL Reference</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="sql/data_types.html">Data Types</a></li>
<li class="toctree-l2"><a class="reference internal" href="sql/select.html">SELECT syntax</a></li>
<li class="toctree-l2"><a class="reference internal" href="sql/subqueries.html">Subqueries</a></li>
<li class="toctree-l2"><a class="reference internal" href="sql/ddl.html">DDL</a></li>
<li class="toctree-l2"><a class="reference internal" href="sql/dml.html">DML</a></li>
<li class="toctree-l2"><a class="reference internal" href="sql/explain.html">EXPLAIN</a></li>
<li class="toctree-l2"><a class="reference internal" href="sql/information_schema.html">Information Schema</a></li>
<li class="toctree-l2"><a class="reference internal" href="sql/operators.html">Operators and Literals</a></li>
<li class="toctree-l2"><a class="reference internal" href="sql/aggregate_functions.html">Aggregate Functions</a></li>
<li class="toctree-l2"><a class="reference internal" href="sql/window_functions.html">Window Functions</a></li>
<li class="toctree-l2"><a class="reference internal" href="sql/scalar_functions.html">Scalar Functions</a></li>
<li class="toctree-l2"><a class="reference internal" href="sql/special_functions.html">Special Functions</a></li>
<li class="toctree-l2"><a class="reference internal" href="sql/format_options.html">Format Options</a></li>
<li class="toctree-l2"><a class="reference internal" href="sql/prepared_statements.html">Prepared Statements</a></li>
</ul>
</details></li>
<li class="toctree-l1"><a class="reference internal" href="configs.html">Configuration Settings</a></li>
<li class="toctree-l1"><a class="reference internal" href="explain-usage.html">Reading Explain Plans</a></li>
<li class="toctree-l1"><a class="reference internal" href="metrics.html">Metrics</a></li>
<li class="toctree-l1"><a class="reference internal" href="faq.html">Frequently Asked Questions</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Library User Guide</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../library-user-guide/index.html">Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="../library-user-guide/upgrading.html">Upgrade Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../library-user-guide/extensions.html">Extensions List</a></li>
<li class="toctree-l1"><a class="reference internal" href="../library-user-guide/using-the-sql-api.html">Using the SQL API</a></li>
<li class="toctree-l1"><a class="reference internal" href="../library-user-guide/extending-sql.html">Extending SQL Syntax</a></li>
<li class="toctree-l1"><a class="reference internal" href="../library-user-guide/working-with-exprs.html">Working with <code class="docutils literal notranslate"><span class="pre">Expr</span></code>s</a></li>
<li class="toctree-l1"><a class="reference internal" href="../library-user-guide/using-the-dataframe-api.html">Using the DataFrame API</a></li>
<li class="toctree-l1"><a class="reference internal" href="../library-user-guide/building-logical-plans.html">Building Logical Plans</a></li>
<li class="toctree-l1"><a class="reference internal" href="../library-user-guide/catalogs.html">Catalogs, Schemas, and Tables</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../library-user-guide/functions/index.html">Functions</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../library-user-guide/functions/adding-udfs.html">Adding User Defined Functions: Scalar/Window/Aggregate/Table Functions</a></li>
<li class="toctree-l2"><a class="reference internal" href="../library-user-guide/functions/spark.html">Spark Compatible Functions</a></li>
</ul>
</details></li>
<li class="toctree-l1"><a class="reference internal" href="../library-user-guide/custom-table-providers.html">Custom Table Provider</a></li>
<li class="toctree-l1"><a class="reference internal" href="../library-user-guide/table-constraints.html">Table Constraint Enforcement</a></li>
<li class="toctree-l1"><a class="reference internal" href="../library-user-guide/extending-operators.html">Extending Operators</a></li>
<li class="toctree-l1"><a class="reference internal" href="../library-user-guide/profiling.html">Profiling Cookbook</a></li>
<li class="toctree-l1"><a class="reference internal" href="../library-user-guide/query-optimizer.html">Query Optimizer</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Contributor Guide</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../contributor-guide/index.html">Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="../contributor-guide/communication.html">Community Communication</a></li>
<li class="toctree-l1"><a class="reference internal" href="../contributor-guide/development_environment.html">Development Environment</a></li>
<li class="toctree-l1"><a class="reference internal" href="../contributor-guide/architecture.html">Architecture</a></li>
<li class="toctree-l1"><a class="reference internal" href="../contributor-guide/architecture/dependency-graph.html">Workspace Dependency Graph</a></li>
<li class="toctree-l1"><a class="reference internal" href="../contributor-guide/testing.html">Testing</a></li>
<li class="toctree-l1"><a class="reference internal" href="../contributor-guide/api-health.html">API health policy</a></li>
<li class="toctree-l1"><a class="reference internal" href="../contributor-guide/howtos.html">HOWTOs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../contributor-guide/roadmap.html">Roadmap and Improvement Proposals</a></li>
<li class="toctree-l1"><a class="reference internal" href="../contributor-guide/governance.html">Governance</a></li>
<li class="toctree-l1"><a class="reference internal" href="../contributor-guide/inviting.html">Inviting New Committers and PMC Members</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../contributor-guide/specification/index.html">Specifications</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../contributor-guide/specification/invariants.html">Invariants</a></li>
<li class="toctree-l2"><a class="reference internal" href="../contributor-guide/specification/output-field-name-semantic.html">Output field name semantics</a></li>
</ul>
</details></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../contributor-guide/gsoc/index.html">Google Summer of Code (GSOC)</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../contributor-guide/gsoc/gsoc_application_guidelines_2025.html">GSoC Application Guidelines (2025)</a></li>
<li class="toctree-l2"><a class="reference internal" href="../contributor-guide/gsoc/gsoc_project_ideas_2025.html">GSoC Project Ideas (2025)</a></li>
</ul>
</details></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">DataFusion Subprojects</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference external" href="https://datafusion.apache.org/ballista/">DataFusion Ballista</a></li>
<li class="toctree-l1"><a class="reference external" href="https://datafusion.apache.org/comet/">DataFusion Comet</a></li>
<li class="toctree-l1"><a class="reference external" href="https://datafusion.apache.org/python/">DataFusion Python</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
<div class="sidebar-primary-item">
<div id="ethical-ad-placement"
class="flat"
data-ea-publisher="readthedocs"
data-ea-type="readthedocs-sidebar"
data-ea-manual="true">
</div></div>
</div>
</div>
<main id="main-content" class="bd-main" role="main">
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article d-print-none">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item">
<nav aria-label="Breadcrumb" class="d-print-none">
<ul class="bd-breadcrumbs">
<li class="breadcrumb-item breadcrumb-home">
<a href="../index.html" class="nav-link" aria-label="Home">
<i class="fa-solid fa-home"></i>
</a>
</li>
<li class="breadcrumb-item active" aria-current="page"><span class="ellipsis">Introduction</span></li>
</ul>
</nav>
</div>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article">
<!---
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<section id="introduction">
<h1>Introduction<a class="headerlink" href="#introduction" title="Link to this heading">#</a></h1>
<p>DataFusion is a very fast, extensible query engine for building
high-quality data-centric systems in <a class="reference external" href="https://www.rust-lang.org/">Rust</a>,
using the <a class="reference external" href="https://arrow.apache.org">Apache Arrow</a> in-memory format.
DataFusion originated as part of the <a class="reference external" href="https://arrow.apache.org/">Apache Arrow</a>
project.</p>
<p>DataFusion offers SQL and DataFrame APIs, excellent <a class="reference external" href="https://benchmark.clickhouse.com/">performance</a>, built-in support for CSV, Parquet, JSON, and Avro, <a class="reference external" href="https://github.com/apache/datafusion-python">Python bindings</a>, extensive customization, a great community, and more.</p>
<section id="project-goals">
<h2>Project Goals<a class="headerlink" href="#project-goals" title="Link to this heading">#</a></h2>
<p>DataFusion aims to be the query engine of choice for new, fast
data-centric systems such as databases, dataframe libraries, machine
learning and streaming applications by leveraging the unique features
of <a class="reference external" href="https://www.rust-lang.org/">Rust</a> and <a class="reference external" href="https://arrow.apache.org/">Apache
Arrow</a>.</p>
</section>
<section id="features">
<h2>Features<a class="headerlink" href="#features" title="Link to this heading">#</a></h2>
<ul class="simple">
<li><p>Feature-rich <a class="reference external" href="https://datafusion.apache.org/user-guide/sql/index.html">SQL support</a> and <a class="reference external" href="https://datafusion.apache.org/user-guide/dataframe.html">DataFrame API</a></p></li>
<li><p>Blazingly fast, vectorized, multithreaded, streaming execution engine.</p></li>
<li><p>Native support for Parquet, CSV, JSON, and Avro file formats. Support
for custom file formats and non-file datasources via the <code class="docutils literal notranslate"><span class="pre">TableProvider</span></code> trait.</p></li>
<li><p>Many extension points: user defined scalar/aggregate/window functions, DataSources, SQL,
other query languages, custom plan and execution nodes, optimizer passes, and more.</p></li>
<li><p>Streaming, asynchronous IO directly from popular object stores, including AWS S3,
Azure Blob Storage, and Google Cloud Storage (Other storage systems are supported via the
<code class="docutils literal notranslate"><span class="pre">ObjectStore</span></code> trait).</p></li>
<li><p><a class="reference external" href="https://docs.rs/datafusion/latest">Excellent Documentation</a> and a
<a class="reference external" href="https://datafusion.apache.org/contributor-guide/communication.html">welcoming community</a>.</p></li>
<li><p>A state-of-the-art query optimizer with expression coercion and
simplification, projection and filter pushdown, sort and distribution
aware optimizations, automatic join reordering, and more.</p></li>
<li><p>Permissive Apache 2.0 License, predictable and well understood
<a class="reference external" href="https://www.apache.org/">Apache Software Foundation</a> governance.</p></li>
<li><p>Implementation in <a class="reference external" href="https://www.rust-lang.org/">Rust</a>, a modern
systems language with development productivity similar to Java or
Golang and the performance of C++, <a class="reference external" href="https://insights.stackoverflow.com/survey/2021#technology-most-loved-dreaded-and-wanted">loved by programmers
everywhere</a>.</p></li>
<li><p>Support for <a class="reference external" href="https://substrait.io/">Substrait</a> query plans, to
easily pass plans across language and system boundaries.</p></li>
</ul>
</section>
<section id="use-cases">
<h2>Use Cases<a class="headerlink" href="#use-cases" title="Link to this heading">#</a></h2>
<p>DataFusion can be used without modification as an embedded SQL
engine or can be customized and used as a foundation for
building new systems.</p>
<p>While most current use cases are “analytic” (throughput-oriented), some
components of DataFusion, such as the plan representations, are
suitable for “streaming” and “transaction” style systems (low
latency).</p>
<p>Here are some example systems built using DataFusion:</p>
<ul class="simple">
<li><p>Specialized Analytical Database systems such as <a class="reference external" href="https://github.com/apache/incubator-horaedb">HoraeDB</a> and more general Apache Spark-like systems such as <a class="reference external" href="https://github.com/apache/datafusion-ballista">Ballista</a></p></li>
<li><p>New query language engines such as <a class="reference external" href="https://github.com/prql/prql-query">prql-query</a> and accelerators such as <a class="reference external" href="https://vegafusion.io/">VegaFusion</a></p></li>
<li><p>Research platform for new Database Systems, such as <a class="reference external" href="https://github.com/flock-lab/flock">Flock</a></p></li>
<li><p>SQL support for another library, such as <a class="reference external" href="https://vortex.dev/">Vortex</a></p></li>
<li><p>Streaming data platforms such as <a class="reference external" href="https://synnada.ai/">Synnada</a></p></li>
<li><p>Tools for reading / sorting / transcoding Parquet, CSV, Avro, and JSON files such as <a class="reference external" href="https://github.com/timvw/qv">qv</a></p></li>
<li><p>Native Spark runtime replacement such as <a class="reference external" href="https://github.com/apache/auron">Auron</a></p></li>
<li><p>Distributed data cache to boost GPU utilization of AI workloads with <a class="reference external" href="https://www.kubeflow.org/docs/components/trainer/user-guides/data-cache/">Kubeflow Trainer</a></p></li>
</ul>
<p>By using DataFusion, projects are freed to focus on their specific
features, and avoid reimplementing general (but still necessary)
features such as an expression representation, standard optimizations,
parallelized streaming execution plans, file format support, etc.</p>
</section>
<section id="known-users">
<h2>Known Users<a class="headerlink" href="#known-users" title="Link to this heading">#</a></h2>
<p>Here are some active projects using DataFusion:</p>
<!-- "Active" means github repositories that had at least one commit in the last 6 months -->
<ul class="simple">
<li><p><a class="reference external" href="https://github.com/ArroyoSystems/arroyo">Arroyo</a> Distributed stream processing engine in Rust</p></li>
<li><p><a class="reference external" href="https://github.com/arkflow-rs/arkflow">ArkFlow</a> High-performance Rust stream processing engine</p></li>
<li><p><a class="reference external" href="https://github.com/apache/auron">Auron</a> The Auron accelerator for big data engine (e.g., Spark, Flink) leverages native vectorized execution to accelerate query processing</p></li>
<li><p><a class="reference external" href="https://github.com/apache/datafusion-ballista">Ballista</a> Distributed SQL Query Engine</p></li>
<li><p><a class="reference external" href="https://github.com/cnosdb/cnosdb">CnosDB</a> Open Source Distributed Time Series Database</p></li>
<li><p><a class="reference external" href="https://github.com/apache/datafusion-comet">Comet</a> Apache Spark native query execution plugin</p></li>
<li><p><a class="reference external" href="https://github.com/cube-js/cube.js/tree/master/rust">Cube Store</a> Cube’s universal semantic layer platform is the next evolution of OLAP technology for AI, BI, spreadsheets, and embedded analytics</p></li>
<li><p><a class="reference external" href="https://github.com/datafusion-contrib/datafusion-dft">datafusion-dft</a> Batteries included CLI, TUI, and server implementations for DataFusion.</p></li>
<li><p><a class="reference external" href="https://github.com/dbt-labs/dbt-fusion">dbt Fusion engine</a> The dbt Fusion engine, written in Rust, designed for speed and correctness with a native SQL understanding across DWH SQL dialects.</p></li>
<li><p><a class="reference external" href="https://github.com/delta-io/delta-rs">delta-rs</a> Native Rust implementation of Delta Lake</p></li>
<li><p><a class="reference external" href="https://www.enterprisedb.com/products/analytics">EDB Postgres Lakehouse</a> built with <a class="reference external" href="https://github.com/splitgraph/seafowl">Seafowl</a></p></li>
<li><p><a class="reference external" href="https://github.com/feldera/feldera">Feldera</a> Fast query engine for incremental computation</p></li>
<li><p><a class="reference external" href="https://funnel.io/">Funnel</a> Data Platform powering Marketing Intelligence applications.</p></li>
<li><p><a class="reference external" href="https://github.com/GlareDB/glaredb">GlareDB</a> Fast SQL database for querying and analyzing distributed data.</p></li>
<li><p><a class="reference external" href="https://github.com/GreptimeTeam/greptimedb">GreptimeDB</a> Open Source &amp; Cloud Native Distributed Time Series Database</p></li>
<li><p><a class="reference external" href="https://hiop.io">hiop</a> Serverless Data Logistic Platform</p></li>
<li><p><a class="reference external" href="https://github.com/apache/incubator-horaedb">HoraeDB</a> Distributed Time-Series Database</p></li>
<li><p><a class="reference external" href="https://github.com/apache/iceberg-rust">Iceberg-rust</a> Rust implementation of Apache Iceberg</p></li>
<li><p><a class="reference external" href="https://github.com/influxdata/influxdb">InfluxDB</a> Time Series Database</p></li>
<li><p><a class="reference external" href="https://github.com/kamu-data/kamu-cli">Kamu</a> Planet-scale streaming data pipeline</p></li>
<li><p><a class="reference external" href="https://github.com/kubeflow/trainer">Kubeflow Trainer</a> Kubernetes-native project designed for
scalable LLMs fine-tuning and distributed AI model training.</p></li>
<li><p><a class="reference external" href="https://github.com/lakesoul-io/LakeSoul">LakeSoul</a> Open source LakeHouse framework with native IO in Rust.</p></li>
<li><p><a class="reference external" href="https://github.com/lancedb/lance">Lance</a> Modern columnar data format for ML</p></li>
<li><p><a class="reference external" href="https://github.com/openobserve/openobserve">OpenObserve</a> Distributed cloud native observability platform</p></li>
<li><p><a class="reference external" href="https://github.com/paradedb/paradedb">ParadeDB</a> PostgreSQL for Search &amp; Analytics</p></li>
<li><p><a class="reference external" href="https://github.com/parseablehq/parseable">Parseable</a> Log storage and observability platform</p></li>
<li><p><a class="reference external" href="https://polygon.io/">Polygon.io</a> Stock Market API</p></li>
<li><p><a class="reference external" href="https://github.com/timvw/qv">qv</a> Quickly view your data</p></li>
<li><p><a class="reference external" href="https://blog.cloudflare.com/r2-sql-deep-dive/">R2 Query Engine</a> Cloudflare’s distributed engine for querying data in Iceberg Catalogs</p></li>
<li><p><a class="reference external" href="https://rerun.io/">rerun.io</a> Visualize and query robotics logs and transform them into training data.</p></li>
<li><p><a class="reference external" href="https://github.com/restatedev">Restate</a> Easily build resilient applications using distributed durable async/await</p></li>
<li><p><a class="reference external" href="https://github.com/roapi/roapi">ROAPI</a> Create full-fledged APIs for slowly moving datasets without writing a single line of code</p></li>
<li><p><a class="reference external" href="https://github.com/lakehq/sail">Sail</a> Unifying stream, batch and AI workloads with Apache Spark compatibility</p></li>
<li><p><a class="reference external" href="https://github.com/apache/sedona-db">SedonaDB</a> A single-node analytical database engine with geospatial as a first-class citizen</p></li>
<li><p><a class="reference external" href="https://github.com/gchq/sleeper">Sleeper</a> Serverless, cloud-native, log-structured merge tree based, scalable key-value store</p></li>
<li><p><a class="reference external" href="https://github.com/spiceai/spiceai">Spice.ai</a> Building blocks for data-driven AI applications</p></li>
<li><p><a class="reference external" href="https://synnada.ai/">Synnada</a> Streaming-first framework for data products</p></li>
<li><p><a class="reference external" href="https://vegafusion.io/">VegaFusion</a> Server-side acceleration for the <a class="reference external" href="https://vega.github.io/">Vega</a> visualization grammar</p></li>
<li><p><a class="reference external" href="https://vortex.dev/">Vortex</a> An extensible, state of the art columnar file format</p></li>
<li><p><a class="reference external" href="https://telemetry.sh/">Telemetry</a> Structured logging made easy</p></li>
<li><p><a class="reference external" href="https://github.com/xorq-labs/xorq/">Xorq</a> Multi-engine batch transformation framework built on Ibis, DataFusion and Arrow</p></li>
</ul>
<p>Here are some less active projects that used DataFusion:</p>
<ul class="simple">
<li><p><a class="reference external" href="https://github.com/datafusion-contrib/bdt">bdt</a> Boring Data Tool</p></li>
<li><p><a class="reference external" href="https://github.com/cloudfuse-io/buzz-rust">Cloudfuse Buzz</a></p></li>
<li><p><a class="reference external" href="https://github.com/dask-contrib/dask-sql">Dask SQL</a> Distributed SQL query engine in Python</p></li>
<li><p><a class="reference external" href="https://github.com/wheretrue/exon">Exon</a> Analysis toolkit for life-science applications</p></li>
<li><p><a class="reference external" href="https://github.com/flock-lab/flock">Flock</a></p></li>
<li><p><a class="reference external" href="https://github.com/tensorbase/tensorbase">Tensorbase</a></p></li>
</ul>
<p>If you know of another project, please submit a PR to add a link!</p>
</section>
<section id="integrations-and-extensions">
<h2>Integrations and Extensions<a class="headerlink" href="#integrations-and-extensions" title="Link to this heading">#</a></h2>
<p>There are a number of community projects that extend DataFusion or
provide integrations with other systems, some of which are described below:</p>
<section id="language-bindings">
<h3>Language Bindings<a class="headerlink" href="#language-bindings" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><a class="reference external" href="https://github.com/datafusion-contrib/datafusion-c">datafusion-c</a></p></li>
<li><p><a class="reference external" href="https://github.com/apache/datafusion-python">datafusion-python</a></p></li>
<li><p><a class="reference external" href="https://github.com/datafusion-contrib/datafusion-ruby">datafusion-ruby</a></p></li>
<li><p><a class="reference external" href="https://github.com/datafusion-contrib/datafusion-java">datafusion-java</a></p></li>
</ul>
</section>
<section id="integrations">
<h3>Integrations<a class="headerlink" href="#integrations" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><a class="reference external" href="https://github.com/datafusion-contrib/datafusion-bigtable">datafusion-bigtable</a></p></li>
<li><p><a class="reference external" href="https://github.com/datafusion-contrib/datafusion-catalogprovider-glue">datafusion-catalogprovider-glue</a></p></li>
<li><p><a class="reference external" href="https://github.com/datafusion-contrib/datafusion-federation">datafusion-federation</a></p></li>
</ul>
</section>
</section>
<section id="why-datafusion">
<h2>Why DataFusion?<a class="headerlink" href="#why-datafusion" title="Link to this heading">#</a></h2>
<ul class="simple">
<li><p><em>High Performance</em>: Leveraging Rust and Arrow’s memory model, DataFusion is very fast.</p></li>
<li><p><em>Easy to Connect</em>: Being part of the Apache Arrow ecosystem (Arrow, Parquet, and Flight), DataFusion works well with the rest of the big data ecosystem.</p></li>
<li><p><em>Easy to Embed</em>: Allowing extension at almost any point in its design, and published regularly as a crate on <a class="reference external" href="https://crates.io">crates.io</a>, DataFusion can be integrated and tailored for your specific use case.</p></li>
<li><p><em>High Quality</em>: Extensively tested, both by itself and with the rest of the Arrow ecosystem, DataFusion can be and is used as the foundation for production systems.</p></li>
</ul>
</section>
<section id="rust-version-compatibility-policy">
<h2>Rust Version Compatibility Policy<a class="headerlink" href="#rust-version-compatibility-policy" title="Link to this heading">#</a></h2>
<p>The Rust toolchain releases are tracked at <a class="reference external" href="https://releases.rs">Rust Versions</a> and follow
<a class="reference external" href="https://semver.org/">semantic versioning</a>. A Rust toolchain release can be identified
by a version string like <code class="docutils literal notranslate"><span class="pre">1.80.0</span></code>, or more generally <code class="docutils literal notranslate"><span class="pre">major.minor.patch</span></code>.</p>
<p>DataFusion supports the last 4 stable Rust minor versions released and any such versions released within the last 4 months.</p>
<p>For example, given the releases <code class="docutils literal notranslate"><span class="pre">1.78.0</span></code>, <code class="docutils literal notranslate"><span class="pre">1.79.0</span></code>, <code class="docutils literal notranslate"><span class="pre">1.80.0</span></code>, <code class="docutils literal notranslate"><span class="pre">1.80.1</span></code> and <code class="docutils literal notranslate"><span class="pre">1.81.0</span></code>, DataFusion will support 1.78.0, which is 3 minor versions prior to the most recent minor version, <code class="docutils literal notranslate"><span class="pre">1.81</span></code>.</p>
<p>Note: If a Rust hotfix is released for the current MSRV, the MSRV will be updated to the specific minor version that includes all applicable hotfixes; this rule takes precedence over the other policies.</p>
<p>DataFusion enforces MSRV policy using a <a class="reference external" href="https://github.com/search?q=repo%3Aapache%2Fdatafusion+rust-version+language%3ATOML+path%3A%2F%5ECargo.toml%2F&amp;type=code">MSRV CI Check</a>.</p>
</section>
</section>
</article>
<footer class="prev-next-footer d-print-none">
<div class="prev-next-area">
<a class="left-prev"
href="../download.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">Download</p>
</div>
</a>
<a class="right-next"
href="example-usage.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">Example Usage</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div>
</footer>
</div>
<dialog id="pst-secondary-sidebar-modal"></dialog>
<div id="pst-secondary-sidebar" class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div
id="pst-page-navigation-heading-2"
class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> On this page
</div>
<nav class="bd-toc-nav page-toc" aria-labelledby="pst-page-navigation-heading-2">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#project-goals">Project Goals</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#features">Features</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#use-cases">Use Cases</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#known-users">Known Users</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#integrations-and-extensions">Integrations and Extensions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#language-bindings">Language Bindings</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#integrations">Integrations</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#why-datafusion">Why DataFusion?</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#rust-version-compatibility-policy">Rust Version Compatibility Policy</a></li>
</ul>
</nav></div>
<div class="sidebar-secondary-item">
<div class="tocsection editthispage">
<a href="https://github.com/apache/arrow-datafusion/edit/main/docs/source/user-guide/introduction.md">
<i class="fa-solid fa-pencil"></i>
Edit on GitHub
</a>
</div>
</div>
<div class="sidebar-secondary-item">
<div role="note" aria-label="source link">
<h3>This Page</h3>
<ul class="this-page-menu">
<li><a href="../_sources/user-guide/introduction.md.txt"
rel="nofollow">Show Source</a></li>
</ul>
</div></div>
</div></div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script defer src="../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf"></script>
<script defer src="../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf"></script>
<!-- Based on pydata_sphinx_theme/footer.html -->
<footer class="footer mt-5 mt-md-0">
<div class="container">
<div class="footer-item">
<p>Apache DataFusion, Apache, the Apache feather logo, and the Apache DataFusion project logo</p>
<p>are either registered trademarks or trademarks of The Apache Software Foundation in the United States and other countries.</p>
</div>
</div>
</footer>
</body>
</html>