blob: 45ac30e2ddcccfa01a3d2982be855b44fbc090db [file] [log] [blame]
<!DOCTYPE html>
<html lang="en" data-content_root="../../">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>User-Defined Functions &#8212; Apache Arrow DataFusion documentation</title>
<link href="../../_static/styles/theme.css?digest=1999514e3f237ded88cf" rel="stylesheet">
<link href="../../_static/styles/pydata-sphinx-theme.css?digest=1999514e3f237ded88cf" rel="stylesheet">
<link rel="stylesheet"
href="../../_static/vendor/fontawesome/5.13.0/css/all.min.css">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2">
<link rel="stylesheet" type="text/css" href="../../_static/pygments.css?v=8f2a1f02" />
<link rel="stylesheet" type="text/css" href="../../_static/styles/pydata-sphinx-theme.css?v=1140d252" />
<link rel="stylesheet" type="text/css" href="../../_static/graphviz.css?v=4ae1632d" />
<link rel="stylesheet" type="text/css" href="../../_static/theme_overrides.css?v=dca7052a" />
<link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=1999514e3f237ded88cf">
<script src="../../_static/documentation_options.js?v=8a448e45"></script>
<script src="../../_static/doctools.js?v=9bcbadda"></script>
<script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
<link rel="index" title="Index" href="../../genindex.html" />
<link rel="search" title="Search" href="../../search.html" />
<link rel="next" title="IO" href="../io/index.html" />
<link rel="prev" title="Window Functions" href="windows.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="en">
<!-- Google Analytics -->
</head>
<body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80">
<div class="container-fluid" id="banner"></div>
<div class="container-xl">
<div class="row">
<!-- Only show if we have sidebars configured, else just a small margin -->
<div class="col-12 col-md-3 bd-sidebar">
<div class="sidebar-start-items">
<a class="navbar-brand" href="../../index.html">
<img src="../../_static/images/2x_bgwhite_original.png" class="logo" alt="logo">
</a>
<form class="bd-search d-flex align-items-center" action="../../search.html" method="get">
<i class="icon fas fa-search"></i>
<input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" >
</form>
<nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
<div class="bd-toc-item active">
<p aria-level="2" class="caption" role="heading">
<span class="caption-text">
LINKS
</span>
</p>
<ul class="nav bd-sidenav">
<li class="toctree-l1">
<a class="reference external" href="https://github.com/apache/datafusion-python">
GitHub and Issue Tracker
</a>
</li>
<li class="toctree-l1">
<a class="reference external" href="https://docs.rs/datafusion/latest/datafusion/">
Rust's API Docs
</a>
</li>
<li class="toctree-l1">
<a class="reference external" href="https://github.com/apache/datafusion/blob/main/CODE_OF_CONDUCT.md">
Code of conduct
</a>
</li>
<li class="toctree-l1">
<a class="reference external" href="https://github.com/apache/datafusion-python/tree/main/examples">
Examples
</a>
</li>
</ul>
<p aria-level="2" class="caption" role="heading">
<span class="caption-text">
USER GUIDE
</span>
</p>
<ul class="current nav bd-sidenav">
<li class="toctree-l1">
<a class="reference internal" href="../introduction.html">
Introduction
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="../basics.html">
Concepts
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="../data-sources.html">
Data Sources
</a>
</li>
<li class="toctree-l1 has-children">
<a class="reference internal" href="../dataframe/index.html">
DataFrames
</a>
<input class="toctree-checkbox" id="toctree-checkbox-1" name="toctree-checkbox-1" type="checkbox"/>
<label for="toctree-checkbox-1">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l2">
<a class="reference internal" href="../dataframe/rendering.html">
HTML Rendering in Jupyter
</a>
</li>
</ul>
</li>
<li class="toctree-l1 current active has-children">
<a class="reference internal" href="index.html">
Common Operations
</a>
<input checked="" class="toctree-checkbox" id="toctree-checkbox-2" name="toctree-checkbox-2" type="checkbox"/>
<label for="toctree-checkbox-2">
<i class="fas fa-chevron-down">
</i>
</label>
<ul class="current">
<li class="toctree-l2">
<a class="reference internal" href="views.html">
Registering Views
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="basic-info.html">
Basic Operations
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="select-and-filter.html">
Column Selections
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="expressions.html">
Expressions
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="joins.html">
Joins
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="functions.html">
Functions
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="aggregations.html">
Aggregation
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="windows.html">
Window Functions
</a>
</li>
<li class="toctree-l2 current active">
<a class="current reference internal" href="#">
User-Defined Functions
</a>
</li>
</ul>
</li>
<li class="toctree-l1 has-children">
<a class="reference internal" href="../io/index.html">
IO
</a>
<input class="toctree-checkbox" id="toctree-checkbox-3" name="toctree-checkbox-3" type="checkbox"/>
<label for="toctree-checkbox-3">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l2">
<a class="reference internal" href="../io/arrow.html">
Arrow
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="../io/avro.html">
Avro
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="../io/csv.html">
CSV
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="../io/json.html">
JSON
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="../io/parquet.html">
Parquet
</a>
</li>
<li class="toctree-l2">
<a class="reference internal" href="../io/table_provider.html">
Custom Table Provider
</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<a class="reference internal" href="../configuration.html">
Configuration
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="../sql.html">
SQL
</a>
</li>
</ul>
<p aria-level="2" class="caption" role="heading">
<span class="caption-text">
CONTRIBUTOR GUIDE
</span>
</p>
<ul class="nav bd-sidenav">
<li class="toctree-l1">
<a class="reference internal" href="../../contributor-guide/introduction.html">
Introduction
</a>
</li>
<li class="toctree-l1">
<a class="reference internal" href="../../contributor-guide/ffi.html">
Python Extensions
</a>
</li>
</ul>
<p aria-level="2" class="caption" role="heading">
<span class="caption-text">
API
</span>
</p>
<ul class="nav bd-sidenav">
<li class="toctree-l1 has-children">
<a class="reference internal" href="../../autoapi/index.html">
API Reference
</a>
<input class="toctree-checkbox" id="toctree-checkbox-4" name="toctree-checkbox-4" type="checkbox"/>
<label for="toctree-checkbox-4">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l2 has-children">
<a class="reference internal" href="../../autoapi/datafusion/index.html">
datafusion
</a>
<input class="toctree-checkbox" id="toctree-checkbox-5" name="toctree-checkbox-5" type="checkbox"/>
<label for="toctree-checkbox-5">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l3">
<a class="reference internal" href="../../autoapi/datafusion/catalog/index.html">
datafusion.catalog
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../../autoapi/datafusion/context/index.html">
datafusion.context
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../../autoapi/datafusion/dataframe/index.html">
datafusion.dataframe
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../../autoapi/datafusion/dataframe_formatter/index.html">
datafusion.dataframe_formatter
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../../autoapi/datafusion/expr/index.html">
datafusion.expr
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../../autoapi/datafusion/functions/index.html">
datafusion.functions
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../../autoapi/datafusion/html_formatter/index.html">
datafusion.html_formatter
</a>
</li>
<li class="toctree-l3 has-children">
<a class="reference internal" href="../../autoapi/datafusion/input/index.html">
datafusion.input
</a>
<input class="toctree-checkbox" id="toctree-checkbox-6" name="toctree-checkbox-6" type="checkbox"/>
<label for="toctree-checkbox-6">
<i class="fas fa-chevron-down">
</i>
</label>
<ul>
<li class="toctree-l4">
<a class="reference internal" href="../../autoapi/datafusion/input/base/index.html">
datafusion.input.base
</a>
</li>
<li class="toctree-l4">
<a class="reference internal" href="../../autoapi/datafusion/input/location/index.html">
datafusion.input.location
</a>
</li>
</ul>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../../autoapi/datafusion/io/index.html">
datafusion.io
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../../autoapi/datafusion/object_store/index.html">
datafusion.object_store
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../../autoapi/datafusion/plan/index.html">
datafusion.plan
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../../autoapi/datafusion/record_batch/index.html">
datafusion.record_batch
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../../autoapi/datafusion/substrait/index.html">
datafusion.substrait
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../../autoapi/datafusion/unparser/index.html">
datafusion.unparser
</a>
</li>
<li class="toctree-l3">
<a class="reference internal" href="../../autoapi/datafusion/user_defined/index.html">
datafusion.user_defined
</a>
</li>
</ul>
</li>
</ul>
</li>
</ul>
</div>
</nav>
</div>
<div class="sidebar-end-items">
</div>
</div>
<div class="d-none d-xl-block col-xl-2 bd-toc">
<div class="toc-item">
<div class="tocsection onthispage pt-5 pb-3">
<i class="fas fa-list"></i> On this page
</div>
<nav id="bd-toc-nav">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link" href="#scalar-functions">
Scalar Functions
</a>
</li>
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link" href="#aggregate-functions">
Aggregate Functions
</a>
</li>
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link" href="#window-functions">
Window Functions
</a>
<ul class="visible nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry">
<a class="reference internal nav-link" href="#udwf-options">
UDWF options
</a>
</li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link" href="#table-functions">
Table Functions
</a>
</li>
</ul>
</nav>
</div>
<div class="toc-item">
</div>
</div>
<main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main">
<div>
<section id="user-defined-functions">
<h1>User-Defined Functions<a class="headerlink" href="#user-defined-functions" title="Link to this heading">¶</a></h1>
<p>DataFusion provides powerful expressions and functions, reducing the need for custom Python
functions. However, you can still incorporate your own functions, i.e. User-Defined Functions (UDFs).</p>
<section id="scalar-functions">
<h2>Scalar Functions<a class="headerlink" href="#scalar-functions" title="Link to this heading">¶</a></h2>
<p>User-defined functions that operate on a row-by-row basis are called Scalar Functions. You can
define your own scalar function by calling
<a class="reference internal" href="../../autoapi/datafusion/user_defined/index.html#datafusion.user_defined.ScalarUDF.udf" title="datafusion.user_defined.ScalarUDF.udf"><code class="xref py py-func docutils literal notranslate"><span class="pre">udf()</span></code></a>.</p>
<p>The basic definition of a scalar UDF is a Python function that takes one or more
<a class="reference external" href="https://arrow.apache.org/docs/python/index.html">pyarrow</a> arrays and returns a single array as
output. DataFusion scalar UDFs operate on an entire batch of records at a time, though the
evaluation of those records should be on a row-by-row basis. In the following example, we determine
which values of the input array are null.</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="n">In</span> <span class="p">[</span><span class="mi">1</span><span class="p">]:</span> <span class="kn">import</span><span class="w"> </span><span class="nn">pyarrow</span>
<span class="n">In</span> <span class="p">[</span><span class="mi">2</span><span class="p">]:</span> <span class="kn">import</span><span class="w"> </span><span class="nn">datafusion</span>
<span class="n">In</span> <span class="p">[</span><span class="mi">3</span><span class="p">]:</span> <span class="kn">from</span><span class="w"> </span><span class="nn">datafusion</span><span class="w"> </span><span class="kn">import</span> <span class="n">udf</span><span class="p">,</span> <span class="n">col</span>
<span class="n">In</span> <span class="p">[</span><span class="mi">4</span><span class="p">]:</span> <span class="k">def</span><span class="w"> </span><span class="nf">is_null</span><span class="p">(</span><span class="n">array</span><span class="p">:</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">Array</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">Array</span><span class="p">:</span>
<span class="o">...</span><span class="p">:</span> <span class="k">return</span> <span class="n">array</span><span class="o">.</span><span class="n">is_null</span><span class="p">()</span>
<span class="o">...</span><span class="p">:</span>
<span class="n">In</span> <span class="p">[</span><span class="mi">5</span><span class="p">]:</span> <span class="n">is_null_arr</span> <span class="o">=</span> <span class="n">udf</span><span class="p">(</span><span class="n">is_null</span><span class="p">,</span> <span class="p">[</span><span class="n">pyarrow</span><span class="o">.</span><span class="n">int64</span><span class="p">()],</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">bool_</span><span class="p">(),</span> <span class="s1">&#39;stable&#39;</span><span class="p">)</span>
<span class="n">In</span> <span class="p">[</span><span class="mi">6</span><span class="p">]:</span> <span class="n">ctx</span> <span class="o">=</span> <span class="n">datafusion</span><span class="o">.</span><span class="n">SessionContext</span><span class="p">()</span>
<span class="n">In</span> <span class="p">[</span><span class="mi">7</span><span class="p">]:</span> <span class="n">batch</span> <span class="o">=</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">RecordBatch</span><span class="o">.</span><span class="n">from_arrays</span><span class="p">(</span>
<span class="o">...</span><span class="p">:</span> <span class="p">[</span><span class="n">pyarrow</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="mi">3</span><span class="p">]),</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])],</span>
<span class="o">...</span><span class="p">:</span> <span class="n">names</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">],</span>
<span class="o">...</span><span class="p">:</span> <span class="p">)</span>
<span class="o">...</span><span class="p">:</span>
<span class="n">In</span> <span class="p">[</span><span class="mi">8</span><span class="p">]:</span> <span class="n">df</span> <span class="o">=</span> <span class="n">ctx</span><span class="o">.</span><span class="n">create_dataframe</span><span class="p">([[</span><span class="n">batch</span><span class="p">]],</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;batch_array&quot;</span><span class="p">)</span>
<span class="n">In</span> <span class="p">[</span><span class="mi">9</span><span class="p">]:</span> <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;a&quot;</span><span class="p">),</span> <span class="n">is_null_arr</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;a&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;is_null&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="n">DataFrame</span><span class="p">()</span>
<span class="o">+---+---------+</span>
<span class="o">|</span> <span class="n">a</span> <span class="o">|</span> <span class="n">is_null</span> <span class="o">|</span>
<span class="o">+---+---------+</span>
<span class="o">|</span> <span class="mi">1</span> <span class="o">|</span> <span class="n">false</span> <span class="o">|</span>
<span class="o">|</span> <span class="o">|</span> <span class="n">true</span> <span class="o">|</span>
<span class="o">|</span> <span class="mi">3</span> <span class="o">|</span> <span class="n">false</span> <span class="o">|</span>
<span class="o">+---+---------+</span>
</pre></div>
</div>
<p>In the previous example, we used the fact that pyarrow provides a variety of built-in array
functions such as <code class="docutils literal notranslate"><span class="pre">is_null()</span></code>. There are additional pyarrow
<a class="reference external" href="https://arrow.apache.org/docs/python/compute.html">compute functions</a> available. When possible,
it is highly recommended to use these functions because they can perform computations without doing
any copy operations from the original arrays. This leads to greatly improved performance.</p>
<p>If you need to perform an operation in Python that is not available with the pyarrow compute
functions, you will need to convert the record batch into Python values, perform your operation,
and construct an array. This operation of converting the built-in data type of the array into a
Python object can be one of the slowest operations in DataFusion, so it should be done sparingly.</p>
<p>The following example performs the same operation as before with <code class="docutils literal notranslate"><span class="pre">is_null</span></code> but demonstrates
converting to Python objects to do the evaluation.</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="n">In</span> <span class="p">[</span><span class="mi">10</span><span class="p">]:</span> <span class="kn">import</span><span class="w"> </span><span class="nn">pyarrow</span>
<span class="n">In</span> <span class="p">[</span><span class="mi">11</span><span class="p">]:</span> <span class="kn">import</span><span class="w"> </span><span class="nn">datafusion</span>
<span class="n">In</span> <span class="p">[</span><span class="mi">12</span><span class="p">]:</span> <span class="kn">from</span><span class="w"> </span><span class="nn">datafusion</span><span class="w"> </span><span class="kn">import</span> <span class="n">udf</span><span class="p">,</span> <span class="n">col</span>
<span class="n">In</span> <span class="p">[</span><span class="mi">13</span><span class="p">]:</span> <span class="k">def</span><span class="w"> </span><span class="nf">is_null</span><span class="p">(</span><span class="n">array</span><span class="p">:</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">Array</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">Array</span><span class="p">:</span>
<span class="o">....</span><span class="p">:</span> <span class="k">return</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="n">value</span><span class="o">.</span><span class="n">as_py</span><span class="p">()</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">for</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">array</span><span class="p">])</span>
<span class="o">....</span><span class="p">:</span>
<span class="n">In</span> <span class="p">[</span><span class="mi">14</span><span class="p">]:</span> <span class="n">is_null_arr</span> <span class="o">=</span> <span class="n">udf</span><span class="p">(</span><span class="n">is_null</span><span class="p">,</span> <span class="p">[</span><span class="n">pyarrow</span><span class="o">.</span><span class="n">int64</span><span class="p">()],</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">bool_</span><span class="p">(),</span> <span class="s1">&#39;stable&#39;</span><span class="p">)</span>
<span class="n">In</span> <span class="p">[</span><span class="mi">15</span><span class="p">]:</span> <span class="n">ctx</span> <span class="o">=</span> <span class="n">datafusion</span><span class="o">.</span><span class="n">SessionContext</span><span class="p">()</span>
<span class="n">In</span> <span class="p">[</span><span class="mi">16</span><span class="p">]:</span> <span class="n">batch</span> <span class="o">=</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">RecordBatch</span><span class="o">.</span><span class="n">from_arrays</span><span class="p">(</span>
<span class="o">....</span><span class="p">:</span> <span class="p">[</span><span class="n">pyarrow</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="mi">3</span><span class="p">]),</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])],</span>
<span class="o">....</span><span class="p">:</span> <span class="n">names</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">],</span>
<span class="o">....</span><span class="p">:</span> <span class="p">)</span>
<span class="o">....</span><span class="p">:</span>
<span class="n">In</span> <span class="p">[</span><span class="mi">17</span><span class="p">]:</span> <span class="n">df</span> <span class="o">=</span> <span class="n">ctx</span><span class="o">.</span><span class="n">create_dataframe</span><span class="p">([[</span><span class="n">batch</span><span class="p">]],</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;batch_array&quot;</span><span class="p">)</span>
<span class="n">In</span> <span class="p">[</span><span class="mi">18</span><span class="p">]:</span> <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;a&quot;</span><span class="p">),</span> <span class="n">is_null_arr</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;a&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;is_null&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="n">DataFrame</span><span class="p">()</span>
<span class="o">+---+---------+</span>
<span class="o">|</span> <span class="n">a</span> <span class="o">|</span> <span class="n">is_null</span> <span class="o">|</span>
<span class="o">+---+---------+</span>
<span class="o">|</span> <span class="mi">1</span> <span class="o">|</span> <span class="n">false</span> <span class="o">|</span>
<span class="o">|</span> <span class="o">|</span> <span class="n">true</span> <span class="o">|</span>
<span class="o">|</span> <span class="mi">3</span> <span class="o">|</span> <span class="n">false</span> <span class="o">|</span>
<span class="o">+---+---------+</span>
</pre></div>
</div>
</section>
<section id="aggregate-functions">
<h2>Aggregate Functions<a class="headerlink" href="#aggregate-functions" title="Link to this heading">¶</a></h2>
<p>The <a class="reference internal" href="../../autoapi/datafusion/user_defined/index.html#datafusion.user_defined.AggregateUDF.udaf" title="datafusion.user_defined.AggregateUDF.udaf"><code class="xref py py-func docutils literal notranslate"><span class="pre">udaf()</span></code></a> function allows you to define User-Defined
Aggregate Functions (UDAFs). To use this you must implement an
<a class="reference internal" href="../../autoapi/datafusion/user_defined/index.html#datafusion.user_defined.Accumulator" title="datafusion.user_defined.Accumulator"><code class="xref py py-class docutils literal notranslate"><span class="pre">Accumulator</span></code></a> that determines how the aggregation is performed.</p>
<p>When defining a UDAF there are four methods you need to implement. The <code class="docutils literal notranslate"><span class="pre">update</span></code> function takes the
array(s) of input and updates the internal state of the accumulator. You should define this function
to have as many input arguments as you will pass when calling the UDAF. Since aggregation may be
split into multiple batches, we must have a method to combine multiple batches. For this, we have
two functions, <code class="docutils literal notranslate"><span class="pre">state</span></code> and <code class="docutils literal notranslate"><span class="pre">merge</span></code>. <code class="docutils literal notranslate"><span class="pre">state</span></code> will return an array of scalar values that contain
the current state of a single batch accumulation. Then we must <code class="docutils literal notranslate"><span class="pre">merge</span></code> the results of these
different states. Finally <code class="docutils literal notranslate"><span class="pre">evaluate</span></code> is the call that will return the final result after the
<code class="docutils literal notranslate"><span class="pre">merge</span></code> is complete.</p>
<p>In the following example we want to define a custom aggregate function that will return the
difference between the sums of two columns. The state can be represented by a single value and we can
also see how the inputs to <code class="docutils literal notranslate"><span class="pre">update</span></code> and <code class="docutils literal notranslate"><span class="pre">merge</span></code> differ.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">pyarrow</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">pyarrow.compute</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">datafusion</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">datafusion</span><span class="w"> </span><span class="kn">import</span> <span class="n">col</span><span class="p">,</span> <span class="n">udaf</span><span class="p">,</span> <span class="n">Accumulator</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">typing</span><span class="w"> </span><span class="kn">import</span> <span class="n">List</span>
<span class="k">class</span><span class="w"> </span><span class="nc">MyAccumulator</span><span class="p">(</span><span class="n">Accumulator</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Interface of a user-defined accumulation.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_sum</span> <span class="o">=</span> <span class="mf">0.0</span>
<span class="k">def</span><span class="w"> </span><span class="nf">update</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">values_a</span><span class="p">:</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">Array</span><span class="p">,</span> <span class="n">values_b</span><span class="p">:</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">Array</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_sum</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sum</span> <span class="o">+</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">compute</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">values_a</span><span class="p">)</span><span class="o">.</span><span class="n">as_py</span><span class="p">()</span> <span class="o">-</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">compute</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">values_b</span><span class="p">)</span><span class="o">.</span><span class="n">as_py</span><span class="p">()</span>
<span class="k">def</span><span class="w"> </span><span class="nf">merge</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">states</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">pyarrow</span><span class="o">.</span><span class="n">Array</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_sum</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sum</span> <span class="o">+</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">compute</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">states</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">.</span><span class="n">as_py</span><span class="p">()</span>
<span class="k">def</span><span class="w"> </span><span class="nf">state</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">Array</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="bp">self</span><span class="o">.</span><span class="n">_sum</span><span class="p">])</span>
<span class="k">def</span><span class="w"> </span><span class="nf">evaluate</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">Scalar</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">scalar</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_sum</span><span class="p">)</span>
<span class="n">ctx</span> <span class="o">=</span> <span class="n">datafusion</span><span class="o">.</span><span class="n">SessionContext</span><span class="p">()</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">ctx</span><span class="o">.</span><span class="n">from_pydict</span><span class="p">(</span>
<span class="p">{</span>
<span class="s2">&quot;a&quot;</span><span class="p">:</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">],</span>
<span class="s2">&quot;b&quot;</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span>
<span class="p">}</span>
<span class="p">)</span>
<span class="n">my_udaf</span> <span class="o">=</span> <span class="n">udaf</span><span class="p">(</span><span class="n">MyAccumulator</span><span class="p">,</span> <span class="p">[</span><span class="n">pyarrow</span><span class="o">.</span><span class="n">float64</span><span class="p">(),</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">float64</span><span class="p">()],</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">float64</span><span class="p">(),</span> <span class="p">[</span><span class="n">pyarrow</span><span class="o">.</span><span class="n">float64</span><span class="p">()],</span> <span class="s1">&#39;stable&#39;</span><span class="p">)</span>
<span class="n">df</span><span class="o">.</span><span class="n">aggregate</span><span class="p">([],</span> <span class="p">[</span><span class="n">my_udaf</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;a&quot;</span><span class="p">),</span> <span class="n">col</span><span class="p">(</span><span class="s2">&quot;b&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;col_diff&quot;</span><span class="p">)])</span>
</pre></div>
</div>
</section>
<section id="window-functions">
<h2>Window Functions<a class="headerlink" href="#window-functions" title="Link to this heading">¶</a></h2>
<p>To implement a User-Defined Window Function (UDWF) you must call the
<a class="reference internal" href="../../autoapi/datafusion/user_defined/index.html#datafusion.user_defined.WindowUDF.udwf" title="datafusion.user_defined.WindowUDF.udwf"><code class="xref py py-func docutils literal notranslate"><span class="pre">udwf()</span></code></a> function using a class that implements the abstract
class <a class="reference internal" href="../../autoapi/datafusion/user_defined/index.html#datafusion.user_defined.WindowEvaluator" title="datafusion.user_defined.WindowEvaluator"><code class="xref py py-class docutils literal notranslate"><span class="pre">WindowEvaluator</span></code></a>.</p>
<p>There are three ways to evaluate a UDWF.</p>
<ul class="simple">
<li><p><code class="docutils literal notranslate"><span class="pre">evaluate</span></code> is given an array and is expected to calculate the
value for a single row of that array. This is the simplest case, but also the least performant.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">evaluate_all</span></code> computes the values for all rows for an input array at a single time.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">evaluate_all_with_rank</span></code> computes the values for all rows, but you only have the rank
information for the rows.</p></li>
</ul>
<p>Which method you need to implement depends on which of the following options are set.</p>
<table class="table">
<thead>
<tr class="row-odd"><th class="head"><p><code class="docutils literal notranslate"><span class="pre">uses_window_frame</span></code></p></th>
<th class="head"><p><code class="docutils literal notranslate"><span class="pre">supports_bounded_execution</span></code></p></th>
<th class="head"><p><code class="docutils literal notranslate"><span class="pre">include_rank</span></code></p></th>
<th class="head"><p>function_to_implement</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p>False (default)</p></td>
<td><p>False (default)</p></td>
<td><p>False (default)</p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">evaluate_all</span></code></p></td>
</tr>
<tr class="row-odd"><td><p>False</p></td>
<td><p>True</p></td>
<td><p>False</p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">evaluate</span></code></p></td>
</tr>
<tr class="row-even"><td><p>False</p></td>
<td><p>True/False</p></td>
<td><p>True</p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">evaluate_all_with_rank</span></code></p></td>
</tr>
<tr class="row-odd"><td><p>True</p></td>
<td><p>True/False</p></td>
<td><p>True/False</p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">evaluate</span></code></p></td>
</tr>
</tbody>
</table>
<section id="udwf-options">
<h3>UDWF options<a class="headerlink" href="#udwf-options" title="Link to this heading">¶</a></h3>
<p>When you define your UDWF you can override the functions that return these values. They will
determine which evaluate functions are called.</p>
<ul class="simple">
<li><p><code class="docutils literal notranslate"><span class="pre">uses_window_frame</span></code> is set for functions that compute based on the specified window frame. If
your function depends upon the specified frame, set this to <code class="docutils literal notranslate"><span class="pre">True</span></code>.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">supports_bounded_execution</span></code> specifies if your function can be incrementally computed.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">include_rank</span></code> is set to <code class="docutils literal notranslate"><span class="pre">True</span></code> for window functions that can be computed using only the rank
information.</p></li>
</ul>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">pyarrow</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">pa</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">datafusion</span><span class="w"> </span><span class="kn">import</span> <span class="n">udwf</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">SessionContext</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">datafusion.user_defined</span><span class="w"> </span><span class="kn">import</span> <span class="n">WindowEvaluator</span>
<span class="k">class</span><span class="w"> </span><span class="nc">ExponentialSmooth</span><span class="p">(</span><span class="n">WindowEvaluator</span><span class="p">):</span>
    <span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">alpha</span><span class="p">:</span> <span class="nb">float</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">alpha</span> <span class="o">=</span> <span class="n">alpha</span>
    <span class="k">def</span><span class="w"> </span><span class="nf">evaluate_all</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">values</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="n">pa</span><span class="o">.</span><span class="n">Array</span><span class="p">],</span> <span class="n">num_rows</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pa</span><span class="o">.</span><span class="n">Array</span><span class="p">:</span>
        <span class="n">results</span> <span class="o">=</span> <span class="p">[]</span>
        <span class="n">curr_value</span> <span class="o">=</span> <span class="mf">0.0</span>
        <span class="n">values</span> <span class="o">=</span> <span class="n">values</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
        <span class="k">for</span> <span class="n">idx</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">num_rows</span><span class="p">):</span>
            <span class="k">if</span> <span class="n">idx</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
                <span class="n">curr_value</span> <span class="o">=</span> <span class="n">values</span><span class="p">[</span><span class="n">idx</span><span class="p">]</span><span class="o">.</span><span class="n">as_py</span><span class="p">()</span>
            <span class="k">else</span><span class="p">:</span>
                <span class="n">curr_value</span> <span class="o">=</span> <span class="n">values</span><span class="p">[</span><span class="n">idx</span><span class="p">]</span><span class="o">.</span><span class="n">as_py</span><span class="p">()</span> <span class="o">*</span> <span class="bp">self</span><span class="o">.</span><span class="n">alpha</span> <span class="o">+</span> <span class="n">curr_value</span> <span class="o">*</span> <span class="p">(</span>
                    <span class="mf">1.0</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">alpha</span>
                <span class="p">)</span>
            <span class="n">results</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">curr_value</span><span class="p">)</span>
        <span class="k">return</span> <span class="n">pa</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">results</span><span class="p">)</span>
<span class="n">exp_smooth</span> <span class="o">=</span> <span class="n">udwf</span><span class="p">(</span>
<span class="n">ExponentialSmooth</span><span class="p">(</span><span class="mf">0.9</span><span class="p">),</span>
<span class="n">pa</span><span class="o">.</span><span class="n">float64</span><span class="p">(),</span>
<span class="n">pa</span><span class="o">.</span><span class="n">float64</span><span class="p">(),</span>
<span class="n">volatility</span><span class="o">=</span><span class="s2">&quot;immutable&quot;</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">ctx</span> <span class="o">=</span> <span class="n">SessionContext</span><span class="p">()</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">ctx</span><span class="o">.</span><span class="n">from_pydict</span><span class="p">({</span>
<span class="s2">&quot;a&quot;</span><span class="p">:</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.1</span><span class="p">,</span> <span class="mf">2.9</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">,</span> <span class="mf">5.1</span><span class="p">,</span> <span class="mf">6.0</span><span class="p">,</span> <span class="mf">6.9</span><span class="p">,</span> <span class="mf">8.0</span><span class="p">]</span>
<span class="p">})</span>
<span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="n">exp_smooth</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;a&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;smooth_a&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
</section>
</section>
<section id="table-functions">
<h2>Table Functions<a class="headerlink" href="#table-functions" title="Link to this heading">¶</a></h2>
<p>User-Defined Table Functions (UDTFs) are slightly different from the other functions
described here. These functions take any number of <code class="docutils literal notranslate"><span class="pre">Expr</span></code> arguments, but only
literal expressions are supported. Table functions must return a Table
Provider as described in the Custom Table Provider page of the <a class="reference internal" href="../io/index.html">IO</a> section.</p>
<p>Once you have a table function, you can register it with the session context
by using <a class="reference internal" href="../../autoapi/datafusion/context/index.html#datafusion.context.SessionContext.register_udtf" title="datafusion.context.SessionContext.register_udtf"><code class="xref py py-func docutils literal notranslate"><span class="pre">datafusion.context.SessionContext.register_udtf()</span></code></a>.</p>
<p>There are examples of both Rust-backed and Python-based table functions in the
examples folder of the repository. If you have a Rust-backed table function
that you wish to expose via PyO3, you need to expose it as a <code class="docutils literal notranslate"><span class="pre">PyCapsule</span></code>.</p>
<div class="highlight-rust notranslate"><div class="highlight"><pre><span></span><span class="cp">#[pymethods]</span>
<span class="k">impl</span><span class="w"> </span><span class="n">MyTableFunction</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">__datafusion_table_function__</span><span class="o">&lt;&#39;</span><span class="na">py</span><span class="o">&gt;</span><span class="p">(</span>
<span class="w"> </span><span class="o">&amp;</span><span class="bp">self</span><span class="p">,</span>
<span class="w"> </span><span class="n">py</span><span class="p">:</span><span class="w"> </span><span class="nc">Python</span><span class="o">&lt;&#39;</span><span class="na">py</span><span class="o">&gt;</span><span class="p">,</span>
<span class="w"> </span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="nc">PyResult</span><span class="o">&lt;</span><span class="n">Bound</span><span class="o">&lt;&#39;</span><span class="na">py</span><span class="p">,</span><span class="w"> </span><span class="n">PyCapsule</span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">name</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">cr</span><span class="s">&quot;datafusion_table_function&quot;</span><span class="p">.</span><span class="n">into</span><span class="p">();</span>
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">func</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">clone</span><span class="p">();</span>
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">provider</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">FFI_TableFunction</span><span class="p">::</span><span class="n">new</span><span class="p">(</span><span class="n">Arc</span><span class="p">::</span><span class="n">new</span><span class="p">(</span><span class="n">func</span><span class="p">),</span><span class="w"> </span><span class="nb">None</span><span class="p">);</span>
<span class="w"> </span><span class="n">PyCapsule</span><span class="p">::</span><span class="n">new</span><span class="p">(</span><span class="n">py</span><span class="p">,</span><span class="w"> </span><span class="n">provider</span><span class="p">,</span><span class="w"> </span><span class="nb">Some</span><span class="p">(</span><span class="n">name</span><span class="p">))</span>
<span class="w"> </span><span class="p">}</span>
<span class="p">}</span>
</pre></div>
</div>
</section>
</section>
</div>
<!-- Previous / next buttons -->
<div class='prev-next-area'>
<a class='left-prev' id="prev-link" href="windows.html" title="previous page">
<i class="fas fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">Window Functions</p>
</div>
</a>
<a class='right-next' id="next-link" href="../io/index.html" title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">IO</p>
</div>
<i class="fas fa-angle-right"></i>
</a>
</div>
</main>
</div>
</div>
<script src="../../_static/scripts/pydata-sphinx-theme.js?digest=1999514e3f237ded88cf"></script>
<!-- Based on pydata_sphinx_theme/footer.html -->
<footer class="footer mt-5 mt-md-0">
<div class="container">
<div class="footer-item">
<p class="copyright">
&copy; Copyright 2019-2024, Apache Software Foundation.<br>
</p>
</div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 8.1.3.<br>
</p>
</div>
<div class="footer-item">
<p>Apache Arrow DataFusion, Arrow DataFusion, Apache, the Apache feather logo, and the Apache Arrow DataFusion project logo
are either registered trademarks or trademarks of The Apache Software Foundation in the United States and other countries.</p>
</div>
</div>
</footer>
</body>
</html>