<!-- blob: cf5c2acd0b9634a46c7ee390c1c97f24a221e3bd [file] [log] [blame] -->
<!DOCTYPE html>
<html lang="en" data-content_root="../../">
<head>
  <meta charset="utf-8" />
  <!--
    Head of the "DataFrames" user-guide page: viewport, title, theme
    stylesheets, Font Awesome font preloads, Sphinx helper scripts, and the
    index/search/prev/next relationship links. All asset URLs are relative to
    the documentation root two directories up (../../).
  -->
  <!-- Exactly one viewport declaration; this tag used to be emitted three times. -->
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>DataFrames &#8212; Apache Arrow DataFusion documentation</title>

  <!-- Theme stylesheets (digest-busted copies). -->
  <link href="../../_static/styles/theme.css?digest=1999514e3f237ded88cf" rel="stylesheet">
  <link href="../../_static/styles/pydata-sphinx-theme.css?digest=1999514e3f237ded88cf" rel="stylesheet">

  <!-- Font Awesome icons; the two webfonts are preloaded, and font preloads must carry crossorigin. -->
  <link rel="stylesheet"
        href="../../_static/vendor/fontawesome/5.13.0/css/all.min.css">
  <link rel="preload" as="font" type="font/woff2" crossorigin
        href="../../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2">
  <link rel="preload" as="font" type="font/woff2" crossorigin
        href="../../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2">

  <!-- Additional stylesheets; theme_overrides.css comes last so it can override the theme. -->
  <link rel="stylesheet" type="text/css" href="../../_static/pygments.css?v=8f2a1f02" />
  <!-- NOTE(review): pydata-sphinx-theme.css is also linked above with a digest= query;
       presumably one of the two links is redundant. Confirm before removing either. -->
  <link rel="stylesheet" type="text/css" href="../../_static/styles/pydata-sphinx-theme.css?v=1140d252" />
  <link rel="stylesheet" type="text/css" href="../../_static/graphviz.css?v=4ae1632d" />
  <link rel="stylesheet" type="text/css" href="../../_static/theme_overrides.css?v=dca7052a" />

  <!-- Scripts stay synchronous and in this order. -->
  <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=1999514e3f237ded88cf">
  <script src="../../_static/documentation_options.js?v=8a448e45"></script>
  <script src="../../_static/doctools.js?v=9bcbadda"></script>
  <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>

  <!-- Document relationships used for index, search, and prev/next navigation. -->
  <link rel="index" title="Index" href="../../genindex.html" />
  <link rel="search" title="Search" href="../../search.html" />
  <link rel="next" title="HTML Rendering in Jupyter" href="rendering.html" />
  <link rel="prev" title="Data Sources" href="../data-sources.html" />
  <meta name="docsearch:language" content="en">
  <!-- Google Analytics -->
</head>
<body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80">
<div class="container-fluid" id="banner"></div>
<div class="container-xl">
<div class="row">
<!-- Only show if we have sidebars configured, else just a small margin -->
<div class="col-12 col-md-3 bd-sidebar">
<div class="sidebar-start-items">
<!-- Brand logo; links back to the documentation root. The image is the link's
     only content, so its alt text names the destination rather than the picture. -->
<a class="navbar-brand" href="../../index.html">
  <img src="../../_static/images/2x_bgwhite_original.png" class="logo" alt="Apache Arrow DataFusion documentation home">
</a>
<!-- Sidebar search box: submits the query as ?q=... (GET) to the search page.
     role="search" exposes the form as a search landmark. The magnifier icon is
     decorative, so it is hidden from assistive technology; the input keeps its
     aria-label as its accessible name. -->
<form class="bd-search d-flex align-items-center" action="../../search.html" method="get" role="search">
  <i class="icon fas fa-search" aria-hidden="true"></i>
  <input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off">
</form>
<!--
  Primary sidebar navigation. Four captioned groups: LINKS (external),
  USER GUIDE, CONTRIBUTOR GUIDE, and API (auto-generated reference tree).
  Entries with children pair a checkbox with a <label> holding a chevron icon;
  each checkbox carries an aria-label because the label itself has no text.
  Chevron icons are decorative and hidden from assistive technology.
  The entry for this page is marked with aria-current="page".
-->
<nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
  <div class="bd-toc-item active">
    <p aria-level="2" class="caption" role="heading">
      <span class="caption-text">
        LINKS
      </span>
    </p>
    <ul class="nav bd-sidenav">
      <li class="toctree-l1">
        <a class="reference external" href="https://github.com/apache/datafusion-python">
          GitHub and Issue Tracker
        </a>
      </li>
      <li class="toctree-l1">
        <a class="reference external" href="https://docs.rs/datafusion/latest/datafusion/">
          Rust's API Docs
        </a>
      </li>
      <li class="toctree-l1">
        <a class="reference external" href="https://github.com/apache/datafusion/blob/main/CODE_OF_CONDUCT.md">
          Code of conduct
        </a>
      </li>
      <li class="toctree-l1">
        <a class="reference external" href="https://github.com/apache/datafusion-python/tree/main/examples">
          Examples
        </a>
      </li>
    </ul>
    <p aria-level="2" class="caption" role="heading">
      <span class="caption-text">
        USER GUIDE
      </span>
    </p>
    <ul class="current nav bd-sidenav">
      <li class="toctree-l1">
        <a class="reference internal" href="../introduction.html">
          Introduction
        </a>
      </li>
      <li class="toctree-l1">
        <a class="reference internal" href="../basics.html">
          Concepts
        </a>
      </li>
      <li class="toctree-l1">
        <a class="reference internal" href="../data-sources.html">
          Data Sources
        </a>
      </li>
      <li class="toctree-l1 current active has-children">
        <!-- Current page: href="#" is kept as generated; aria-current announces it as the active entry. -->
        <a class="current reference internal" href="#" aria-current="page">
          DataFrames
        </a>
        <input checked="" class="toctree-checkbox" id="toctree-checkbox-1" name="toctree-checkbox-1" type="checkbox" aria-label="Toggle DataFrames subsections"/>
        <label for="toctree-checkbox-1">
          <i class="fas fa-chevron-down" aria-hidden="true">
          </i>
        </label>
        <ul>
          <li class="toctree-l2">
            <a class="reference internal" href="rendering.html">
              HTML Rendering in Jupyter
            </a>
          </li>
        </ul>
      </li>
      <li class="toctree-l1 has-children">
        <a class="reference internal" href="../common-operations/index.html">
          Common Operations
        </a>
        <input class="toctree-checkbox" id="toctree-checkbox-2" name="toctree-checkbox-2" type="checkbox" aria-label="Toggle Common Operations subsections"/>
        <label for="toctree-checkbox-2">
          <i class="fas fa-chevron-down" aria-hidden="true">
          </i>
        </label>
        <ul>
          <li class="toctree-l2">
            <a class="reference internal" href="../common-operations/views.html">
              Registering Views
            </a>
          </li>
          <li class="toctree-l2">
            <a class="reference internal" href="../common-operations/basic-info.html">
              Basic Operations
            </a>
          </li>
          <li class="toctree-l2">
            <a class="reference internal" href="../common-operations/select-and-filter.html">
              Column Selections
            </a>
          </li>
          <li class="toctree-l2">
            <a class="reference internal" href="../common-operations/expressions.html">
              Expressions
            </a>
          </li>
          <li class="toctree-l2">
            <a class="reference internal" href="../common-operations/joins.html">
              Joins
            </a>
          </li>
          <li class="toctree-l2">
            <a class="reference internal" href="../common-operations/functions.html">
              Functions
            </a>
          </li>
          <li class="toctree-l2">
            <a class="reference internal" href="../common-operations/aggregations.html">
              Aggregation
            </a>
          </li>
          <li class="toctree-l2">
            <a class="reference internal" href="../common-operations/windows.html">
              Window Functions
            </a>
          </li>
          <li class="toctree-l2">
            <a class="reference internal" href="../common-operations/udf-and-udfa.html">
              User-Defined Functions
            </a>
          </li>
        </ul>
      </li>
      <li class="toctree-l1 has-children">
        <a class="reference internal" href="../io/index.html">
          IO
        </a>
        <input class="toctree-checkbox" id="toctree-checkbox-3" name="toctree-checkbox-3" type="checkbox" aria-label="Toggle IO subsections"/>
        <label for="toctree-checkbox-3">
          <i class="fas fa-chevron-down" aria-hidden="true">
          </i>
        </label>
        <ul>
          <li class="toctree-l2">
            <a class="reference internal" href="../io/arrow.html">
              Arrow
            </a>
          </li>
          <li class="toctree-l2">
            <a class="reference internal" href="../io/avro.html">
              Avro
            </a>
          </li>
          <li class="toctree-l2">
            <a class="reference internal" href="../io/csv.html">
              CSV
            </a>
          </li>
          <li class="toctree-l2">
            <a class="reference internal" href="../io/json.html">
              JSON
            </a>
          </li>
          <li class="toctree-l2">
            <a class="reference internal" href="../io/parquet.html">
              Parquet
            </a>
          </li>
          <li class="toctree-l2">
            <a class="reference internal" href="../io/table_provider.html">
              Custom Table Provider
            </a>
          </li>
        </ul>
      </li>
      <li class="toctree-l1">
        <a class="reference internal" href="../configuration.html">
          Configuration
        </a>
      </li>
      <li class="toctree-l1">
        <a class="reference internal" href="../sql.html">
          SQL
        </a>
      </li>
    </ul>
    <p aria-level="2" class="caption" role="heading">
      <span class="caption-text">
        CONTRIBUTOR GUIDE
      </span>
    </p>
    <ul class="nav bd-sidenav">
      <li class="toctree-l1">
        <a class="reference internal" href="../../contributor-guide/introduction.html">
          Introduction
        </a>
      </li>
      <li class="toctree-l1">
        <a class="reference internal" href="../../contributor-guide/ffi.html">
          Python Extensions
        </a>
      </li>
    </ul>
    <p aria-level="2" class="caption" role="heading">
      <span class="caption-text">
        API
      </span>
    </p>
    <ul class="nav bd-sidenav">
      <li class="toctree-l1 has-children">
        <a class="reference internal" href="../../autoapi/index.html">
          API Reference
        </a>
        <input class="toctree-checkbox" id="toctree-checkbox-4" name="toctree-checkbox-4" type="checkbox" aria-label="Toggle API Reference subsections"/>
        <label for="toctree-checkbox-4">
          <i class="fas fa-chevron-down" aria-hidden="true">
          </i>
        </label>
        <ul>
          <li class="toctree-l2 has-children">
            <a class="reference internal" href="../../autoapi/datafusion/index.html">
              datafusion
            </a>
            <input class="toctree-checkbox" id="toctree-checkbox-5" name="toctree-checkbox-5" type="checkbox" aria-label="Toggle datafusion subsections"/>
            <label for="toctree-checkbox-5">
              <i class="fas fa-chevron-down" aria-hidden="true">
              </i>
            </label>
            <ul>
              <li class="toctree-l3">
                <a class="reference internal" href="../../autoapi/datafusion/catalog/index.html">
                  datafusion.catalog
                </a>
              </li>
              <li class="toctree-l3">
                <a class="reference internal" href="../../autoapi/datafusion/context/index.html">
                  datafusion.context
                </a>
              </li>
              <li class="toctree-l3">
                <a class="reference internal" href="../../autoapi/datafusion/dataframe/index.html">
                  datafusion.dataframe
                </a>
              </li>
              <li class="toctree-l3">
                <a class="reference internal" href="../../autoapi/datafusion/dataframe_formatter/index.html">
                  datafusion.dataframe_formatter
                </a>
              </li>
              <li class="toctree-l3">
                <a class="reference internal" href="../../autoapi/datafusion/expr/index.html">
                  datafusion.expr
                </a>
              </li>
              <li class="toctree-l3">
                <a class="reference internal" href="../../autoapi/datafusion/functions/index.html">
                  datafusion.functions
                </a>
              </li>
              <li class="toctree-l3">
                <a class="reference internal" href="../../autoapi/datafusion/html_formatter/index.html">
                  datafusion.html_formatter
                </a>
              </li>
              <li class="toctree-l3 has-children">
                <a class="reference internal" href="../../autoapi/datafusion/input/index.html">
                  datafusion.input
                </a>
                <input class="toctree-checkbox" id="toctree-checkbox-6" name="toctree-checkbox-6" type="checkbox" aria-label="Toggle datafusion.input subsections"/>
                <label for="toctree-checkbox-6">
                  <i class="fas fa-chevron-down" aria-hidden="true">
                  </i>
                </label>
                <ul>
                  <li class="toctree-l4">
                    <a class="reference internal" href="../../autoapi/datafusion/input/base/index.html">
                      datafusion.input.base
                    </a>
                  </li>
                  <li class="toctree-l4">
                    <a class="reference internal" href="../../autoapi/datafusion/input/location/index.html">
                      datafusion.input.location
                    </a>
                  </li>
                </ul>
              </li>
              <li class="toctree-l3">
                <a class="reference internal" href="../../autoapi/datafusion/io/index.html">
                  datafusion.io
                </a>
              </li>
              <li class="toctree-l3">
                <a class="reference internal" href="../../autoapi/datafusion/object_store/index.html">
                  datafusion.object_store
                </a>
              </li>
              <li class="toctree-l3">
                <a class="reference internal" href="../../autoapi/datafusion/plan/index.html">
                  datafusion.plan
                </a>
              </li>
              <li class="toctree-l3">
                <a class="reference internal" href="../../autoapi/datafusion/record_batch/index.html">
                  datafusion.record_batch
                </a>
              </li>
              <li class="toctree-l3">
                <a class="reference internal" href="../../autoapi/datafusion/substrait/index.html">
                  datafusion.substrait
                </a>
              </li>
              <li class="toctree-l3">
                <a class="reference internal" href="../../autoapi/datafusion/unparser/index.html">
                  datafusion.unparser
                </a>
              </li>
              <li class="toctree-l3">
                <a class="reference internal" href="../../autoapi/datafusion/user_defined/index.html">
                  datafusion.user_defined
                </a>
              </li>
            </ul>
          </li>
        </ul>
      </li>
    </ul>
  </div>
</nav>
</div>
<div class="sidebar-end-items">
</div>
</div>
<div class="d-none d-xl-block col-xl-2 bd-toc">
<!-- "On this page" table of contents: one entry per h2 section of the article. -->
<div class="toc-item">
  <div class="tocsection onthispage pt-5 pb-3">
    <!-- Decorative list icon; the visible text carries the meaning. -->
    <i class="fas fa-list" aria-hidden="true"></i> On this page
  </div>
  <!-- The id is referenced by the body's data-target="#bd-toc-nav" scroll-spy hook, so keep it stable.
       aria-label distinguishes this landmark from the "Main navigation" nav. -->
  <nav id="bd-toc-nav" aria-label="On this page">
    <ul class="visible nav section-nav flex-column">
      <li class="toc-h2 nav-item toc-entry">
        <a class="reference internal nav-link" href="#overview">
          Overview
        </a>
      </li>
      <li class="toc-h2 nav-item toc-entry">
        <a class="reference internal nav-link" href="#creating-dataframes">
          Creating DataFrames
        </a>
      </li>
      <li class="toc-h2 nav-item toc-entry">
        <a class="reference internal nav-link" href="#common-dataframe-operations">
          Common DataFrame Operations
        </a>
      </li>
      <li class="toc-h2 nav-item toc-entry">
        <a class="reference internal nav-link" href="#column-names-as-function-arguments">
          Column Names as Function Arguments
        </a>
      </li>
      <li class="toc-h2 nav-item toc-entry">
        <a class="reference internal nav-link" href="#terminal-operations">
          Terminal Operations
        </a>
      </li>
      <li class="toc-h2 nav-item toc-entry">
        <a class="reference internal nav-link" href="#html-rendering">
          HTML Rendering
        </a>
      </li>
      <li class="toc-h2 nav-item toc-entry">
        <a class="reference internal nav-link" href="#core-classes">
          Core Classes
        </a>
      </li>
      <li class="toc-h2 nav-item toc-entry">
        <a class="reference internal nav-link" href="#expression-classes">
          Expression Classes
        </a>
      </li>
      <li class="toc-h2 nav-item toc-entry">
        <a class="reference internal nav-link" href="#built-in-functions">
          Built-in Functions
        </a>
      </li>
    </ul>
  </nav>
</div>
<div class="toc-item">
</div>
</div>
<main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main">
<div>
<section id="dataframes">
<h1>DataFrames<a class="headerlink" href="#dataframes" title="Link to this heading"></a></h1>
<section id="overview">
<h2>Overview<a class="headerlink" href="#overview" title="Link to this heading"></a></h2>
<p>The <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code> class is the core abstraction in DataFusion that represents tabular data and operations
on that data. DataFrames provide a flexible API for transforming data through various operations such as
filtering, projection, aggregation, joining, and more.</p>
<p>A DataFrame represents a logical plan that is lazily evaluated. The actual execution occurs only when
terminal operations like <code class="docutils literal notranslate"><span class="pre">collect()</span></code>, <code class="docutils literal notranslate"><span class="pre">show()</span></code>, or <code class="docutils literal notranslate"><span class="pre">to_pandas()</span></code> are called.</p>
</section>
<section id="creating-dataframes">
<h2>Creating DataFrames<a class="headerlink" href="#creating-dataframes" title="Link to this heading"></a></h2>
<p>DataFrames can be created in several ways:</p>
<ul>
<li><p>From SQL queries via a <code class="docutils literal notranslate"><span class="pre">SessionContext</span></code>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">datafusion</span><span class="w"> </span><span class="kn">import</span> <span class="n">SessionContext</span>
<span class="n">ctx</span> <span class="o">=</span> <span class="n">SessionContext</span><span class="p">()</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">ctx</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;SELECT * FROM your_table&quot;</span><span class="p">)</span>
</pre></div>
</div>
</li>
<li><p>From registered tables:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">df</span> <span class="o">=</span> <span class="n">ctx</span><span class="o">.</span><span class="n">table</span><span class="p">(</span><span class="s2">&quot;your_table&quot;</span><span class="p">)</span>
</pre></div>
</div>
</li>
<li><p>From various data sources:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># From CSV files (see the CSV page of the I/O guide for detailed options)</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">ctx</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s2">&quot;path/to/data.csv&quot;</span><span class="p">)</span>
<span class="c1"># From Parquet files (see the Parquet page of the I/O guide for detailed options)</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">ctx</span><span class="o">.</span><span class="n">read_parquet</span><span class="p">(</span><span class="s2">&quot;path/to/data.parquet&quot;</span><span class="p">)</span>
<span class="c1"># From JSON files (see the JSON page of the I/O guide for detailed options)</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">ctx</span><span class="o">.</span><span class="n">read_json</span><span class="p">(</span><span class="s2">&quot;path/to/data.json&quot;</span><span class="p">)</span>
<span class="c1"># From Avro files (see the Avro page of the I/O guide for detailed options)</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">ctx</span><span class="o">.</span><span class="n">read_avro</span><span class="p">(</span><span class="s2">&quot;path/to/data.avro&quot;</span><span class="p">)</span>
<span class="c1"># From Pandas DataFrame</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">pandas</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">pd</span>
<span class="n">pandas_df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s2">&quot;a&quot;</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="s2">&quot;b&quot;</span><span class="p">:</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">]})</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">ctx</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span><span class="n">pandas_df</span><span class="p">)</span>
<span class="c1"># From Arrow data</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">pyarrow</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">pa</span>
<span class="n">batch</span> <span class="o">=</span> <span class="n">pa</span><span class="o">.</span><span class="n">RecordBatch</span><span class="o">.</span><span class="n">from_arrays</span><span class="p">(</span>
<span class="p">[</span><span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">]),</span> <span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])],</span>
<span class="n">names</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">]</span>
<span class="p">)</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">ctx</span><span class="o">.</span><span class="n">from_arrow</span><span class="p">(</span><span class="n">batch</span><span class="p">)</span>
</pre></div>
</div>
</li>
</ul>
<p>For detailed information about reading from different data sources, see the <a class="reference internal" href="../io/index.html"><span class="doc">I/O Guide</span></a>.
For custom data sources, see <a class="reference internal" href="../io/table_provider.html#io-custom-table-provider"><span class="std std-ref">Custom Table Provider</span></a>.</p>
</section>
<section id="common-dataframe-operations">
<h2>Common DataFrame Operations<a class="headerlink" href="#common-dataframe-operations" title="Link to this heading"></a></h2>
<p>DataFusion’s DataFrame API offers a wide range of operations:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">datafusion</span><span class="w"> </span><span class="kn">import</span> <span class="n">column</span><span class="p">,</span> <span class="n">literal</span>
<span class="c1"># Select specific columns</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;col1&quot;</span><span class="p">,</span> <span class="s2">&quot;col2&quot;</span><span class="p">)</span>
<span class="c1"># Select with expressions</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">column</span><span class="p">(</span><span class="s2">&quot;a&quot;</span><span class="p">)</span> <span class="o">+</span> <span class="n">column</span><span class="p">(</span><span class="s2">&quot;b&quot;</span><span class="p">),</span> <span class="n">column</span><span class="p">(</span><span class="s2">&quot;a&quot;</span><span class="p">)</span> <span class="o">-</span> <span class="n">column</span><span class="p">(</span><span class="s2">&quot;b&quot;</span><span class="p">))</span>
<span class="c1"># Filter rows</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">column</span><span class="p">(</span><span class="s2">&quot;age&quot;</span><span class="p">)</span> <span class="o">&gt;</span> <span class="n">literal</span><span class="p">(</span><span class="mi">25</span><span class="p">))</span>
<span class="c1"># Add computed columns</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">with_column</span><span class="p">(</span><span class="s2">&quot;full_name&quot;</span><span class="p">,</span> <span class="n">column</span><span class="p">(</span><span class="s2">&quot;first_name&quot;</span><span class="p">)</span> <span class="o">+</span> <span class="n">literal</span><span class="p">(</span><span class="s2">&quot; &quot;</span><span class="p">)</span> <span class="o">+</span> <span class="n">column</span><span class="p">(</span><span class="s2">&quot;last_name&quot;</span><span class="p">))</span>
<span class="c1"># Multiple column additions</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">with_columns</span><span class="p">(</span>
<span class="p">(</span><span class="n">column</span><span class="p">(</span><span class="s2">&quot;a&quot;</span><span class="p">)</span> <span class="o">+</span> <span class="n">column</span><span class="p">(</span><span class="s2">&quot;b&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;sum&quot;</span><span class="p">),</span>
<span class="p">(</span><span class="n">column</span><span class="p">(</span><span class="s2">&quot;a&quot;</span><span class="p">)</span> <span class="o">*</span> <span class="n">column</span><span class="p">(</span><span class="s2">&quot;b&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;product&quot;</span><span class="p">)</span>
<span class="p">)</span>
<span class="c1"># Sort data</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">sort</span><span class="p">(</span><span class="n">column</span><span class="p">(</span><span class="s2">&quot;age&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">sort</span><span class="p">(</span><span class="n">ascending</span><span class="o">=</span><span class="kc">False</span><span class="p">))</span>
<span class="c1"># Join DataFrames</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df1</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">df2</span><span class="p">,</span> <span class="n">on</span><span class="o">=</span><span class="s2">&quot;user_id&quot;</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="s2">&quot;inner&quot;</span><span class="p">)</span>
<span class="c1"># Aggregate data</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">datafusion</span><span class="w"> </span><span class="kn">import</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">f</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">aggregate</span><span class="p">(</span>
<span class="p">[],</span> <span class="c1"># Group by columns (empty for global aggregation)</span>
<span class="p">[</span><span class="n">f</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">column</span><span class="p">(</span><span class="s2">&quot;amount&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;total_amount&quot;</span><span class="p">)]</span>
<span class="p">)</span>
<span class="c1"># Limit rows</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">limit</span><span class="p">(</span><span class="mi">100</span><span class="p">)</span>
<span class="c1"># Drop columns</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="s2">&quot;temporary_column&quot;</span><span class="p">)</span>
</pre></div>
</div>
</section>
<section id="column-names-as-function-arguments">
<h2>Column Names as Function Arguments<a class="headerlink" href="#column-names-as-function-arguments" title="Link to this heading"></a></h2>
<p>Some <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code> methods accept column names when an argument refers to an
existing column. These include:</p>
<ul class="simple">
<li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">select()</span></code></p></li>
<li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">sort()</span></code></p></li>
<li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">drop()</span></code></p></li>
<li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">join()</span></code> (<code class="docutils literal notranslate"><span class="pre">on</span></code> argument)</p></li>
<li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">aggregate()</span></code> (grouping columns)</p></li>
</ul>
<p>See the full function documentation for details on any specific function.</p>
<p>Note that <code class="xref py py-meth docutils literal notranslate"><span class="pre">join_on()</span></code> expects <code class="docutils literal notranslate"><span class="pre">col()</span></code>/<code class="docutils literal notranslate"><span class="pre">column()</span></code> expressions rather than plain strings.</p>
<p>For the methods listed above, you can pass column names directly:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">datafusion</span><span class="w"> </span><span class="kn">import</span> <span class="n">col</span><span class="p">,</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">f</span>
<span class="n">df</span><span class="o">.</span><span class="n">sort</span><span class="p">(</span><span class="s1">&#39;id&#39;</span><span class="p">)</span>
<span class="n">df</span><span class="o">.</span><span class="n">aggregate</span><span class="p">(</span><span class="s1">&#39;id&#39;</span><span class="p">,</span> <span class="p">[</span><span class="n">f</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s1">&#39;value&#39;</span><span class="p">))])</span>
</pre></div>
</div>
<p>The same operations can also be written with explicit column expressions, using either <code class="docutils literal notranslate"><span class="pre">col()</span></code> or <code class="docutils literal notranslate"><span class="pre">column()</span></code>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">datafusion</span><span class="w"> </span><span class="kn">import</span> <span class="n">col</span><span class="p">,</span> <span class="n">column</span><span class="p">,</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">f</span>
<span class="n">df</span><span class="o">.</span><span class="n">sort</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s1">&#39;id&#39;</span><span class="p">))</span>
<span class="n">df</span><span class="o">.</span><span class="n">aggregate</span><span class="p">(</span><span class="n">column</span><span class="p">(</span><span class="s1">&#39;id&#39;</span><span class="p">),</span> <span class="p">[</span><span class="n">f</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s1">&#39;value&#39;</span><span class="p">))])</span>
</pre></div>
</div>
<p>Note that <code class="docutils literal notranslate"><span class="pre">column()</span></code> is an alias of <code class="docutils literal notranslate"><span class="pre">col()</span></code>, so you can use either name; the example above shows both in action.</p>
<p>Whenever an argument represents an expression—such as in
<code class="xref py py-meth docutils literal notranslate"><span class="pre">filter()</span></code> or
<code class="xref py py-meth docutils literal notranslate"><span class="pre">with_column()</span></code>—use <code class="docutils literal notranslate"><span class="pre">col()</span></code> to reference
columns. The comparison and arithmetic operators on <code class="docutils literal notranslate"><span class="pre">Expr</span></code> will automatically
convert any non-<code class="docutils literal notranslate"><span class="pre">Expr</span></code> value into a literal expression, so writing</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">datafusion</span><span class="w"> </span><span class="kn">import</span> <span class="n">col</span>
<span class="n">df</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;age&quot;</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">21</span><span class="p">)</span>
</pre></div>
</div>
<p>is equivalent to using <code class="docutils literal notranslate"><span class="pre">lit(21)</span></code> explicitly. Use <code class="docutils literal notranslate"><span class="pre">lit()</span></code> (also available
as <code class="docutils literal notranslate"><span class="pre">literal()</span></code>) when you need to construct a literal expression directly.</p>
</section>
<section id="terminal-operations">
<h2>Terminal Operations<a class="headerlink" href="#terminal-operations" title="Link to this heading"></a></h2>
<p>To materialize the results of your DataFrame operations:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Collect all data as PyArrow RecordBatches</span>
<span class="n">result_batches</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
<span class="c1"># Convert to various formats</span>
<span class="n">pandas_df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span> <span class="c1"># Pandas DataFrame</span>
<span class="n">polars_df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_polars</span><span class="p">()</span> <span class="c1"># Polars DataFrame</span>
<span class="n">arrow_table</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_arrow_table</span><span class="p">()</span> <span class="c1"># PyArrow Table</span>
<span class="n">py_dict</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_pydict</span><span class="p">()</span> <span class="c1"># Python dictionary</span>
<span class="n">py_list</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_pylist</span><span class="p">()</span> <span class="c1"># Python list of dictionaries</span>
<span class="c1"># Display results</span>
<span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> <span class="c1"># Print tabular format to console</span>
<span class="c1"># Count rows</span>
<span class="n">count</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
</pre></div>
</div>
</section>
<section id="html-rendering">
<h2>HTML Rendering<a class="headerlink" href="#html-rendering" title="Link to this heading"></a></h2>
<p>When working in Jupyter notebooks or other environments that support HTML rendering, DataFrames will
automatically display as formatted HTML tables. For detailed information about customizing HTML
rendering, formatting options, and advanced styling, see <a class="reference internal" href="rendering.html"><span class="doc">HTML Rendering in Jupyter</span></a>.</p>
</section>
<section id="core-classes">
<h2>Core Classes<a class="headerlink" href="#core-classes" title="Link to this heading"></a></h2>
<dl>
<dt><strong>DataFrame</strong></dt><dd><p>The main DataFrame class for building and executing queries.</p>
<p>See: <code class="xref py py-class docutils literal notranslate"><span class="pre">datafusion.DataFrame</span></code></p>
</dd>
<dt><strong>SessionContext</strong></dt><dd><p>The primary entry point for creating DataFrames from various data sources.</p>
<p>Key methods for DataFrame creation:</p>
<ul class="simple">
<li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">read_csv()</span></code> - Read CSV files</p></li>
<li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">read_parquet()</span></code> - Read Parquet files</p></li>
<li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">read_json()</span></code> - Read JSON files</p></li>
<li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">read_avro()</span></code> - Read Avro files</p></li>
<li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">table()</span></code> - Access registered tables</p></li>
<li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">sql()</span></code> - Execute SQL queries</p></li>
<li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">from_pandas()</span></code> - Create from Pandas DataFrame</p></li>
<li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">from_arrow()</span></code> - Create from Arrow data</p></li>
</ul>
<p>See: <code class="xref py py-class docutils literal notranslate"><span class="pre">datafusion.SessionContext</span></code></p>
</dd>
</dl>
</section>
<section id="expression-classes">
<h2>Expression Classes<a class="headerlink" href="#expression-classes" title="Link to this heading"></a></h2>
<dl>
<dt><strong>Expr</strong></dt><dd><p>Represents expressions that can be used in DataFrame operations.</p>
<p>See: <a class="reference internal" href="../../autoapi/datafusion/index.html#datafusion.Expr" title="datafusion.Expr"><code class="xref py py-class docutils literal notranslate"><span class="pre">datafusion.Expr</span></code></a></p>
</dd>
</dl>
<p><strong>Functions for creating expressions:</strong></p>
<ul class="simple">
<li><p><a class="reference internal" href="../../autoapi/datafusion/index.html#datafusion.column" title="datafusion.column"><code class="xref py py-func docutils literal notranslate"><span class="pre">datafusion.column()</span></code></a> - Reference a column by name</p></li>
<li><p><a class="reference internal" href="../../autoapi/datafusion/index.html#datafusion.literal" title="datafusion.literal"><code class="xref py py-func docutils literal notranslate"><span class="pre">datafusion.literal()</span></code></a> - Create a literal value expression</p></li>
</ul>
</section>
<section id="built-in-functions">
<h2>Built-in Functions<a class="headerlink" href="#built-in-functions" title="Link to this heading"></a></h2>
<p>DataFusion provides many built-in functions for data manipulation:</p>
<ul class="simple">
<li><p><a class="reference internal" href="../../autoapi/datafusion/functions/index.html#module-datafusion.functions" title="datafusion.functions"><code class="xref py py-mod docutils literal notranslate"><span class="pre">datafusion.functions</span></code></a> - Mathematical, string, date/time, and aggregation functions</p></li>
</ul>
<p>For a complete list of available functions, see the <a class="reference internal" href="../../autoapi/datafusion/functions/index.html#module-datafusion.functions" title="datafusion.functions"><code class="xref py py-mod docutils literal notranslate"><span class="pre">datafusion.functions</span></code></a> module documentation.</p>
<div class="toctree-wrapper compound">
<ul>
<li class="toctree-l1"><a class="reference internal" href="rendering.html">HTML Rendering in Jupyter</a></li>
</ul>
</div>
</section>
</section>
</div>
<!-- Previous / next buttons -->
<div class='prev-next-area'>
<a class='left-prev' id="prev-link" href="../data-sources.html" title="previous page">
<i class="fas fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">Data Sources</p>
</div>
</a>
<a class='right-next' id="next-link" href="rendering.html" title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">HTML Rendering in Jupyter</p>
</div>
<i class="fas fa-angle-right"></i>
</a>
</div>
</main>
</div>
</div>
<script src="../../_static/scripts/pydata-sphinx-theme.js?digest=1999514e3f237ded88cf"></script>
<!-- Based on pydata_sphinx_theme/footer.html -->
<footer class="footer mt-5 mt-md-0">
<div class="container">
<div class="footer-item">
<p class="copyright">
&copy; Copyright 2019-2024, Apache Software Foundation.<br>
</p>
</div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 8.1.3.<br>
</p>
</div>
<div class="footer-item">
<p>Apache Arrow DataFusion, Arrow DataFusion, Apache, the Apache feather logo, and the Apache Arrow DataFusion project logo
are either registered trademarks or trademarks of The Apache Software Foundation in the United States and other countries.</p>
</div>
</div>
</footer>
</body>
</html>