blob: 610b66a4c549156b5200631be20d7b925ca89eb6 [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.17.1: http://docutils.sourceforge.net/" />
<title>Python User-defined Table Functions (UDTFs) &#8212; PySpark 4.0.0-preview1 documentation</title>
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "light";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../../_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../../_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../../_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../../_static/pygments.css" />
<link rel="stylesheet" type="text/css" href="../../_static/copybutton.css" />
<link rel="stylesheet" type="text/css" href="../../_static/css/pyspark.css" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=e353d410970836974a52" />
<link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" />
<script data-url_root="../../" id="documentation_options" src="../../_static/documentation_options.js"></script>
<script src="../../_static/jquery.js"></script>
<script src="../../_static/underscore.js"></script>
<script src="../../_static/doctools.js"></script>
<script src="../../_static/clipboard.min.js"></script>
<script src="../../_static/copybutton.js"></script>
<script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
<script>DOCUMENTATION_OPTIONS.pagename = 'user_guide/sql/python_udtf';</script>
<link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/user_guide/sql/python_udtf.html" />
<link rel="search" title="Search" href="../../search.html" />
<link rel="next" title="Python Data Source API" href="python_data_source.html" />
<link rel="prev" title="Apache Arrow in PySpark" href="arrow_pandas.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="None">
<!-- Matomo -->
<script type="text/javascript">
// Matomo web analytics bootstrap: queue tracker commands on the global
// _paq array, then load the tracker script asynchronously.
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
// Cookie-less tracking: disable Matomo's cookies before the first hit is sent.
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
// '40' is the Matomo site id configured for this documentation deployment.
_paq.push(['setSiteId', '40']);
// Inject the matomo.js loader before the first script tag on the page;
// async so it does not block rendering.
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<a class="skip-link" href="#main-content">Skip to main content</a>
<input type="checkbox"
class="sidebar-toggle"
name="__primary"
id="__primary"/>
<label class="overlay overlay-primary" for="__primary"></label>
<input type="checkbox"
class="sidebar-toggle"
name="__secondary"
id="__secondary"/>
<label class="overlay overlay-secondary" for="__secondary"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="../../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search the docs ..."
aria-label="Search the docs ..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<nav class="bd-header navbar navbar-expand-lg bd-navbar">
<div class="bd-header__inner bd-page-width">
<label class="sidebar-toggle primary-toggle" for="__primary">
<span class="fa-solid fa-bars"></span>
</label>
<div class="navbar-header-items__start">
<div class="navbar-item">
<a class="navbar-brand logo" href="../../index.html">
<img src="../../_static/spark-logo-light.png" class="logo__image only-light" alt="Logo image"/>
<script>document.write(`<img src="../../_static/spark-logo-dark.png" class="logo__image only-dark" alt="Logo image"/>`);</script>
</a></div>
</div>
<div class="col-lg-9 navbar-header-items">
<div class="me-auto navbar-header-items__center">
<div class="navbar-item"><nav class="navbar-nav">
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../../index.html">
Overview
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../getting_started/index.html">
Getting Started
</a>
</li>
<li class="nav-item current active">
<a class="nav-link nav-internal" href="../index.html">
User Guides
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../reference/index.html">
API Reference
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../development/index.html">
Development
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</nav></div>
</div>
<div class="navbar-header-items__end">
<div class="navbar-item navbar-persistent--container">
<script>
document.write(`
<button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
</button>
`);
</script>
</div>
<div class="navbar-item"><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
4.0.0-preview1
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
// Function to construct the target URL from the JSON components
// Construct the docs URL for one entry of versions.json.
// "{version}" is the only placeholder in the pattern and is filled
// from the entry's `version` field.
function buildURL(entry) {
  const urlPattern = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja
  return urlPattern.replace("{version}", entry.version);
}
// Function to check if corresponding page path exists in other version of docs
// and, if so, go there instead of the homepage of the other docs version
// Click handler for version-switcher links: before navigating to the other
// docs version's homepage (the link's href), probe whether this same page
// path exists there and, if so, go directly to it instead.
function checkPageExistsAndRedirect(event) {
const currentFilePath = "user_guide/sql/python_udtf.html",
otherDocsHomepage = event.target.getAttribute("href");
let tryUrl = `${otherDocsHomepage}${currentFilePath}`;
// HEAD request: only the status code matters, not the response body.
$.ajax({
type: 'HEAD',
url: tryUrl,
// if the page exists, go there
success: function() {
location.href = tryUrl;
}
}).fail(function() {
// Page not present in the other version: fall back to its homepage.
location.href = otherDocsHomepage;
});
// Returning false cancels the default link navigation; the ajax callbacks
// above perform the actual redirect once the probe completes.
return false;
}
// Function to populate the version switcher
// Populate the version-switcher dropdown from the published versions.json
// manifest as soon as this script runs.
(function () {
// get JSON config
$.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) {
// create the nodes first (before AJAX calls) to ensure the order is
// correct (for now, links will go to doc version homepage)
$.each(data, function(index, entry) {
// if no custom name specified (e.g., "latest"), use version string
if (!("name" in entry)) {
entry.name = entry.version;
}
// construct the appropriate URL, and add it to the dropdown
entry.url = buildURL(entry);
const node = document.createElement("a");
node.setAttribute("class", "list-group-item list-group-item-action py-1");
node.setAttribute("href", `${entry.url}`);
node.textContent = `${entry.name}`;
// Clicking a version probes for this page in the target version first
// (see checkPageExistsAndRedirect in this file).
node.onclick = checkPageExistsAndRedirect;
$("#version_switcher").append(node);
});
});
})();
</script></div>
<div class="navbar-item">
<script>
document.write(`
<button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
<span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
<span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span>
<label class="sr-only">GitHub</label></a>
</li>
<li class="nav-item">
<a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span>
<label class="sr-only">PyPI</label></a>
</li>
</ul></div>
</div>
</div>
<div class="navbar-persistent--mobile">
<script>
document.write(`
<button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
</button>
`);
</script>
</div>
<label class="sidebar-toggle secondary-toggle" for="__secondary">
<span class="fa-solid fa-outdent"></span>
</label>
</div>
</nav>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar">
<div class="sidebar-header-items sidebar-primary__section">
<div class="sidebar-header-items__center">
<div class="navbar-item"><nav class="navbar-nav">
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../../index.html">
Overview
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../getting_started/index.html">
Getting Started
</a>
</li>
<li class="nav-item current active">
<a class="nav-link nav-internal" href="../index.html">
User Guides
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../reference/index.html">
API Reference
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../development/index.html">
Development
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</nav></div>
</div>
<div class="sidebar-header-items__end">
<div class="navbar-item"><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
4.0.0-preview1
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
// Function to construct the target URL from the JSON components
// Construct the docs URL for one entry of versions.json.
// "{version}" is the only placeholder in the pattern and is filled
// from the entry's `version` field.
function buildURL(entry) {
  const urlPattern = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja
  return urlPattern.replace("{version}", entry.version);
}
// Function to check if corresponding page path exists in other version of docs
// and, if so, go there instead of the homepage of the other docs version
// Click handler for version-switcher links: before navigating to the other
// docs version's homepage (the link's href), probe whether this same page
// path exists there and, if so, go directly to it instead.
function checkPageExistsAndRedirect(event) {
const currentFilePath = "user_guide/sql/python_udtf.html",
otherDocsHomepage = event.target.getAttribute("href");
let tryUrl = `${otherDocsHomepage}${currentFilePath}`;
// HEAD request: only the status code matters, not the response body.
$.ajax({
type: 'HEAD',
url: tryUrl,
// if the page exists, go there
success: function() {
location.href = tryUrl;
}
}).fail(function() {
// Page not present in the other version: fall back to its homepage.
location.href = otherDocsHomepage;
});
// Returning false cancels the default link navigation; the ajax callbacks
// above perform the actual redirect once the probe completes.
return false;
}
// Function to populate the version switcher
// Populate the version-switcher dropdown from the published versions.json
// manifest as soon as this script runs.
(function () {
// get JSON config
$.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) {
// create the nodes first (before AJAX calls) to ensure the order is
// correct (for now, links will go to doc version homepage)
$.each(data, function(index, entry) {
// if no custom name specified (e.g., "latest"), use version string
if (!("name" in entry)) {
entry.name = entry.version;
}
// construct the appropriate URL, and add it to the dropdown
entry.url = buildURL(entry);
const node = document.createElement("a");
node.setAttribute("class", "list-group-item list-group-item-action py-1");
node.setAttribute("href", `${entry.url}`);
node.textContent = `${entry.name}`;
// Clicking a version probes for this page in the target version first
// (see checkPageExistsAndRedirect in this file).
node.onclick = checkPageExistsAndRedirect;
$("#version_switcher").append(node);
});
});
})();
</script></div>
<div class="navbar-item">
<script>
document.write(`
<button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
<span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
<span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span>
<label class="sr-only">GitHub</label></a>
</li>
<li class="nav-item">
<a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span>
<label class="sr-only">PyPI</label></a>
</li>
</ul></div>
</div>
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item"><nav class="bd-docs-nav bd-links"
aria-label="Section Navigation">
<p class="bd-links__title" role="heading" aria-level="1">Section Navigation</p>
<div class="bd-toc-item navbar-nav"><ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../python_packaging.html">Python Package Management</a></li>
<li class="toctree-l1 current active has-children"><a class="reference internal" href="index.html">Spark SQL</a><input checked="" class="toctree-checkbox" id="toctree-checkbox-1" name="toctree-checkbox-1" type="checkbox"/><label class="toctree-toggle" for="toctree-checkbox-1"><i class="fa-solid fa-chevron-down"></i></label><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="arrow_pandas.html">Apache Arrow in PySpark</a></li>
<li class="toctree-l2 current active"><a class="current reference internal" href="#">Python User-defined Table Functions (UDTFs)</a></li>
<li class="toctree-l2"><a class="reference internal" href="python_data_source.html">Python Data Source API</a></li>
<li class="toctree-l2"><a class="reference internal" href="type_conversions.html">Python to Spark Type Conversions</a></li>
</ul>
</li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../pandas_on_spark/index.html">Pandas API on Spark</a><input class="toctree-checkbox" id="toctree-checkbox-2" name="toctree-checkbox-2" type="checkbox"/><label class="toctree-toggle" for="toctree-checkbox-2"><i class="fa-solid fa-chevron-down"></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../pandas_on_spark/options.html">Options and settings</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pandas_on_spark/pandas_pyspark.html">From/to pandas and PySpark DataFrames</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pandas_on_spark/transform_apply.html">Transform and apply a function</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pandas_on_spark/types.html">Type Support in Pandas API on Spark</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pandas_on_spark/typehints.html">Type Hints in Pandas API on Spark</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pandas_on_spark/from_to_dbms.html">From/to other DBMSes</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pandas_on_spark/best_practices.html">Best Practices</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pandas_on_spark/supported_pandas_api.html">Supported pandas API</a></li>
<li class="toctree-l2"><a class="reference internal" href="../pandas_on_spark/faq.html">FAQ</a></li>
</ul>
</li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main">
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item">
<nav aria-label="Breadcrumbs">
<ul class="bd-breadcrumbs" role="navigation" aria-label="Breadcrumb">
<li class="breadcrumb-item breadcrumb-home">
<a href="../../index.html" class="nav-link" aria-label="Home">
<i class="fa-solid fa-home"></i>
</a>
</li>
<li class="breadcrumb-item"><a href="../index.html" class="nav-link">User Guides</a></li>
<li class="breadcrumb-item"><a href="index.html" class="nav-link">Spark SQL</a></li>
<li class="breadcrumb-item active" aria-current="page">Python User-defined Table Functions (UDTFs)</li>
</ul>
</nav>
</div>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article" role="main">
<section id="python-user-defined-table-functions-udtfs">
<h1>Python User-defined Table Functions (UDTFs)<a class="headerlink" href="#python-user-defined-table-functions-udtfs" title="Permalink to this headline">#</a></h1>
<p>Spark 3.5 introduces the Python user-defined table function (UDTF), a new type of user-defined function.
Unlike scalar functions that return a single result value from each call, each UDTF is invoked in
the <code class="docutils literal notranslate"><span class="pre">FROM</span></code> clause of a query and returns an entire table as output.
Each UDTF call can accept zero or more arguments.
These arguments can either be scalar expressions or table arguments that represent entire input tables.</p>
<section id="implementing-a-python-udtf">
<h2>Implementing a Python UDTF<a class="headerlink" href="#implementing-a-python-udtf" title="Permalink to this headline">#</a></h2>
<p>To implement a Python UDTF, you first need to define a class implementing the methods:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span> <span class="nc">PythonUDTF</span><span class="p">:</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Initializes the user-defined table function (UDTF). This is optional.</span>
<span class="sd"> This method serves as the default constructor and is called once when the</span>
<span class="sd"> UDTF is instantiated on the executor side.</span>
<span class="sd"> Any class fields assigned in this method will be available for subsequent</span>
<span class="sd"> calls to the `eval` and `terminate` methods. This class instance will remain</span>
<span class="sd"> alive until all rows in the current partition have been consumed by the `eval`</span>
<span class="sd"> method.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> - You cannot create or reference the Spark session within the UDTF. Any</span>
<span class="sd"> attempt to do so will result in a serialization error.</span>
<span class="sd"> - If the below `analyze` method is implemented, it is also possible to define this</span>
<span class="sd"> method as: `__init__(self, analyze_result: AnalyzeResult)`. In this case, the result</span>
<span class="sd"> of the `analyze` method is passed into all future instantiations of this UDTF class.</span>
<span class="sd"> In this way, the UDTF may inspect the schema and metadata of the output table as</span>
<span class="sd"> needed during execution of other methods in this class. Note that it is possible to</span>
<span class="sd"> create a subclass of the `AnalyzeResult` class if desired for purposes of passing</span>
<span class="sd"> custom information generated just once during UDTF analysis to other method calls;</span>
<span class="sd"> this can be especially useful if this initialization is expensive.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">analyze</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">AnalyzeArgument</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">AnalyzeResult</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Static method to compute the output schema of a particular call to this function in</span>
<span class="sd"> response to the arguments provided.</span>
<span class="sd"> This method is optional and only needed if the registration of the UDTF did not provide</span>
<span class="sd"> a static output schema to be use for all calls to the function. In this context,</span>
<span class="sd"> `output schema` refers to the ordered list of the names and types of the columns in the</span>
<span class="sd"> function&#39;s result table.</span>
<span class="sd"> This method accepts zero or more parameters mapping 1:1 with the arguments provided to</span>
<span class="sd"> the particular UDTF call under consideration. Each parameter is an instance of the</span>
<span class="sd"> `AnalyzeArgument` class.</span>
<span class="sd"> `AnalyzeArgument` fields</span>
<span class="sd"> ------------------------</span>
<span class="sd"> dataType: DataType</span>
<span class="sd"> Indicates the type of the provided input argument to this particular UDTF call.</span>
<span class="sd"> For input table arguments, this is a StructType representing the table&#39;s columns.</span>
<span class="sd"> value: Optional[Any]</span>
<span class="sd"> The value of the provided input argument to this particular UDTF call. This is</span>
<span class="sd"> `None` for table arguments, or for literal scalar arguments that are not constant.</span>
<span class="sd"> isTable: bool</span>
<span class="sd"> This is true if the provided input argument to this particular UDTF call is a</span>
<span class="sd"> table argument.</span>
<span class="sd"> isConstantExpression: bool</span>
<span class="sd"> This is true if the provided input argument to this particular UDTF call is either a</span>
<span class="sd"> literal or other constant-foldable scalar expression.</span>
<span class="sd"> This method returns an instance of the `AnalyzeResult` class which includes the result</span>
<span class="sd"> table&#39;s schema as a StructType. If the UDTF accepts an input table argument, then the</span>
<span class="sd"> `AnalyzeResult` can also include a requested way to partition and order the rows of</span>
<span class="sd"> the input table across several UDTF calls. See below for more information about UDTF</span>
<span class="sd"> table arguments and how to call them in SQL queries, including the WITH SINGLE</span>
<span class="sd"> PARTITION clause (corresponding to the `withSinglePartition` field here), PARTITION BY</span>
<span class="sd"> clause (corresponding to the `partitionBy` field here), ORDER BY clause (corresponding</span>
<span class="sd"> to the `orderBy` field here), and passing table subqueries as arguments (corresponding</span>
<span class="sd"> to the `select` field here).</span>
<span class="sd"> `AnalyzeResult` fields</span>
<span class="sd"> ----------------------</span>
<span class="sd"> schema: StructType</span>
<span class="sd"> The schema of the result table.</span>
<span class="sd"> withSinglePartition: bool = False</span>
<span class="sd"> If True, the query planner will arrange a repartitioning operation from the previous</span>
<span class="sd"> execution stage such that all rows of the input table are consumed by the `eval`</span>
<span class="sd"> method from exactly one instance of the UDTF class.</span>
<span class="sd"> partitionBy: Sequence[PartitioningColumn] = field(default_factory=tuple)</span>
<span class="sd"> If non-empty, the query planner will arrange a repartitioning such that all rows</span>
<span class="sd"> with each unique combination of values of the partitioning expressions are consumed</span>
<span class="sd"> by a separate unique instance of the UDTF class.</span>
<span class="sd"> orderBy: Sequence[OrderingColumn] = field(default_factory=tuple)</span>
<span class="sd"> If non-empty, this specifies the requested ordering of rows within each partition.</span>
<span class="sd"> select: Sequence[SelectedColumn] = field(default_factory=tuple)</span>
<span class="sd"> If non-empty, this is a sequence of expressions that the UDTF is specifying for</span>
<span class="sd"> Catalyst to evaluate against the columns in the input TABLE argument. The UDTF then</span>
<span class="sd"> receives one input attribute for each name in the list, in the order they are</span>
<span class="sd"> listed.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> - It is possible for the `analyze` method to accept the exact arguments expected,</span>
<span class="sd"> mapping 1:1 with the arguments provided to the UDTF call.</span>
<span class="sd"> - The `analyze` method can instead choose to accept positional arguments if desired</span>
<span class="sd"> (using `*args`) or keyword arguments (using `**kwargs`).</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> This is an `analyze` implementation that returns one output column for each word in the</span>
<span class="sd"> input string argument.</span>
<span class="sd"> &gt;&gt;&gt; @staticmethod</span>
<span class="sd"> ... def analyze(text: str) -&gt; AnalyzeResult:</span>
<span class="sd"> ... schema = StructType()</span>
<span class="sd"> ... for index, word in enumerate(text.split(&quot; &quot;)):</span>
<span class="sd"> ... schema = schema.add(f&quot;word_{index}&quot;)</span>
<span class="sd"> ... return AnalyzeResult(schema=schema)</span>
<span class="sd"> Same as above, but using *args to accept the arguments.</span>
<span class="sd"> &gt;&gt;&gt; @staticmethod</span>
<span class="sd"> ... def analyze(*args) -&gt; AnalyzeResult:</span>
<span class="sd"> ... assert len(args) == 1, &quot;This function accepts one argument only&quot;</span>
<span class="sd"> ... assert args[0].dataType == StringType(), &quot;Only string arguments are supported&quot;</span>
<span class="sd"> ... text = args[0]</span>
<span class="sd"> ... schema = StructType()</span>
<span class="sd"> ... for index, word in enumerate(text.split(&quot; &quot;)):</span>
<span class="sd"> ... schema = schema.add(f&quot;word_{index}&quot;)</span>
<span class="sd"> ... return AnalyzeResult(schema=schema)</span>
<span class="sd"> Same as above, but using **kwargs to accept the arguments.</span>
<span class="sd"> &gt;&gt;&gt; @staticmethod</span>
<span class="sd"> ... def analyze(**kwargs) -&gt; AnalyzeResult:</span>
<span class="sd"> ... assert len(kwargs) == 1, &quot;This function accepts one argument only&quot;</span>
<span class="sd"> ... assert &quot;text&quot; in kwargs, &quot;An argument named &#39;text&#39; is required&quot;</span>
<span class="sd"> ... assert kwargs[&quot;text&quot;].dataType == StringType(), &quot;Only strings are supported&quot;</span>
<span class="sd"> ... text = args[&quot;text&quot;]</span>
<span class="sd"> ... schema = StructType()</span>
<span class="sd"> ... for index, word in enumerate(text.split(&quot; &quot;)):</span>
<span class="sd"> ... schema = schema.add(f&quot;word_{index}&quot;)</span>
<span class="sd"> ... return AnalyzeResult(schema=schema)</span>
<span class="sd"> This is an `analyze` implementation that returns a constant output schema, but add</span>
<span class="sd"> custom information in the result metadata to be consumed by future __init__ method</span>
<span class="sd"> calls:</span>
<span class="sd"> &gt;&gt;&gt; @staticmethod</span>
<span class="sd"> ... def analyze(text: str) -&gt; AnalyzeResult:</span>
<span class="sd"> ... @dataclass</span>
<span class="sd"> ... class AnalyzeResultWithOtherMetadata(AnalyzeResult):</span>
<span class="sd"> ... num_words: int</span>
<span class="sd"> ... num_articles: int</span>
<span class="sd"> ... words = text.split(&quot; &quot;)</span>
<span class="sd"> ... return AnalyzeResultWithOtherMetadata(</span>
<span class="sd"> ... schema=StructType()</span>
<span class="sd"> ... .add(&quot;word&quot;, StringType())</span>
<span class="sd"> ... .add(&#39;total&quot;, IntegerType()),</span>
<span class="sd"> ... num_words=len(words),</span>
<span class="sd"> ... num_articles=len([</span>
<span class="sd"> ... word for word in words</span>
<span class="sd"> ... if word == &#39;a&#39; or word == &#39;an&#39; or word == &#39;the&#39;]))</span>
<span class="sd"> This is an `analyze` implementation that returns a constant output schema, and also</span>
<span class="sd"> requests to select a subset of columns from the input table and for the input table to</span>
<span class="sd"> be partitioned across several UDTF calls based on the values of the `date` column.</span>
<span class="sd"> A SQL query may call this UDTF, passing a table argument like &quot;SELECT * FROM udtf(TABLE(t))&quot;.</span>
<span class="sd"> Then this `analyze` method specifies additional constraints on the input table:</span>
<span class="sd"> (1) The input table must be partitioned across several UDTF calls based on the month</span>
<span class="sd"> value of each `date` column.</span>
<span class="sd"> (2) The rows within each partition will arrive ordered by the `date` column.</span>
<span class="sd"> (3) The UDTF will only receive the `date` and `word` columns from the input table.</span>
<span class="sd"> &gt;&gt;&gt; @staticmethod</span>
<span class="sd"> ... def analyze(*args) -&gt; AnalyzeResult:</span>
<span class="sd"> ... assert len(args) == 1, &quot;This function accepts one argument only&quot;</span>
<span class="sd"> ... assert args[0].isTable, &quot;Only table arguments are supported&quot;</span>
<span class="sd"> ... return AnalyzeResult(</span>
<span class="sd"> ... schema=StructType()</span>
<span class="sd"> ... .add(&quot;month&quot;, DateType())</span>
<span class="sd"> ... .add(&#39;longest_word&quot;, IntegerType()),</span>
<span class="sd"> ... partitionBy=[</span>
<span class="sd"> ... PartitioningColumn(&quot;extract(month from date)&quot;)],</span>
<span class="sd"> ... orderBy=[</span>
<span class="sd"> ... OrderingColumn(&quot;date&quot;)],</span>
<span class="sd"> ... select=[</span>
<span class="sd"> ... SelectedColumn(&quot;date&quot;),</span>
<span class="sd"> ... SelectedColumn(</span>
<span class="sd"> ... name=&quot;length(word),</span>
<span class="sd"> ... alias=&quot;length_word&quot;)])</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<span class="k">def</span> <span class="nf">eval</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Any</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Evaluates the function using the given input arguments.</span>
<span class="sd"> This method is required and must be implemented.</span>
<span class="sd"> Argument Mapping:</span>
<span class="sd"> - Each provided scalar expression maps to exactly one value in the</span>
<span class="sd"> `*args` list.</span>
<span class="sd"> - Each provided table argument maps to a pyspark.sql.Row object containing</span>
<span class="sd"> the columns in the order they appear in the provided input table,</span>
<span class="sd"> and with the names computed by the query analyzer.</span>
<span class="sd"> This method is called on every input row, and can produce zero or more</span>
<span class="sd"> output rows. Each element in the output tuple corresponds to one column</span>
<span class="sd"> specified in the return type of the UDTF.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> *args : Any</span>
<span class="sd"> Arbitrary positional arguments representing the input to the UDTF.</span>
<span class="sd"> Yields</span>
<span class="sd"> ------</span>
<span class="sd"> tuple</span>
<span class="sd"> A tuple, list, or pyspark.sql.Row object representing a single row in the UDTF</span>
<span class="sd"> result table. Yield as many times as needed to produce multiple rows.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> - It is also possible for UDTFs to accept the exact arguments expected, along with</span>
<span class="sd"> their types.</span>
<span class="sd"> - UDTFs can instead accept keyword arguments during the function call if needed.</span>
<span class="sd"> - The `eval` method can raise a `SkipRestOfInputTableException` to indicate that the</span>
<span class="sd"> UDTF wants to skip consuming all remaining rows from the current partition of the</span>
<span class="sd"> input table. This will cause the UDTF to proceed directly to the `terminate` method.</span>
<span class="sd"> - The `eval` method can raise any other exception to indicate that the UDTF should be</span>
<span class="sd"> aborted entirely. This will cause the UDTF to skip the `terminate` method and proceed</span>
<span class="sd"> directly to the `cleanup` method, and then the exception will be propagated to the</span>
<span class="sd"> query processor causing the invoking query to fail.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> This `eval` method returns one row and one column for each input.</span>
<span class="sd"> &gt;&gt;&gt; def eval(self, x: int):</span>
<span class="sd"> ... yield (x, )</span>
<span class="sd"> This `eval` method returns two rows and two columns for each input.</span>
<span class="sd"> &gt;&gt;&gt; def eval(self, x: int, y: int):</span>
<span class="sd"> ... yield (x + y, x - y)</span>
<span class="sd"> ... yield (y + x, y - x)</span>
<span class="sd"> Same as above, but using *args to accept the arguments:</span>
<span class="sd"> &gt;&gt;&gt; def eval(self, *args):</span>
<span class="sd"> ... assert len(args) == 2, &quot;This function accepts two integer arguments only&quot;</span>
<span class="sd"> ... x = args[0]</span>
<span class="sd"> ... y = args[1]</span>
<span class="sd"> ... yield (x + y, x - y)</span>
<span class="sd"> ... yield (y + x, y - x)</span>
<span class="sd"> Same as above, but using **kwargs to accept the arguments:</span>
<span class="sd"> &gt;&gt;&gt; def eval(self, **kwargs):</span>
<span class="sd"> ... assert len(kwargs) == 2, &quot;This function accepts two integer arguments only&quot;</span>
<span class="sd"> ... x = kwargs[&quot;x&quot;]</span>
<span class="sd"> ... y = kwargs[&quot;y&quot;]</span>
<span class="sd"> ... yield (x + y, x - y)</span>
<span class="sd"> ... yield (y + x, y - x)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<span class="k">def</span> <span class="nf">terminate</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Any</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Called when the UDTF has successfully processed all input rows.</span>
<span class="sd"> This method is optional to implement and is useful for performing any</span>
<span class="sd"> finalization operations after the UDTF has finished processing</span>
<span class="sd"> all rows. It can also be used to yield additional rows if needed.</span>
<span class="sd"> Table functions that consume all rows in the entire input partition</span>
<span class="sd"> and then compute and return the entire output table can do so from</span>
<span class="sd"> this method as well (please be mindful of memory usage when doing</span>
<span class="sd"> this).</span>
<span class="sd"> If any exceptions occur during input row processing, this method</span>
<span class="sd"> won&#39;t be called.</span>
<span class="sd"> Yields</span>
<span class="sd"> ------</span>
<span class="sd"> tuple</span>
<span class="sd"> A tuple representing a single row in the UDTF result table.</span>
<span class="sd"> Yield this if you want to return additional rows during termination.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; def terminate(self) -&gt; Iterator[Any]:</span>
<span class="sd"> &gt;&gt;&gt; yield &quot;done&quot;, None</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<span class="k">def</span> <span class="nf">cleanup</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Invoked after the UDTF completes processing input rows.</span>
<span class="sd"> This method is optional to implement and is useful for final cleanup</span>
<span class="sd"> regardless of whether the UDTF processed all input rows successfully</span>
<span class="sd"> or was aborted due to exceptions.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; def cleanup(self) -&gt; None:</span>
<span class="sd"> &gt;&gt;&gt; self.conn.close()</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
</pre></div>
</div>
</section>
<section id="defining-the-output-schema">
<h2>Defining the Output Schema<a class="headerlink" href="#defining-the-output-schema" title="Permalink to this headline">#</a></h2>
<p>The return type of the UDTF defines the schema of the table it outputs.</p>
<p>You can specify it either after the <code class="docutils literal notranslate"><span class="pre">&#64;udtf</span></code> decorator or as a result from the <code class="docutils literal notranslate"><span class="pre">analyze</span></code> method.</p>
<p>It must be either a <code class="docutils literal notranslate"><span class="pre">StructType</span></code>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">StructType</span><span class="p">()</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s2">&quot;c1&quot;</span><span class="p">,</span> <span class="n">StringType</span><span class="p">())</span>
</pre></div>
</div>
<p>or a DDL string representing a struct type:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">c1</span><span class="p">:</span> <span class="n">string</span>
</pre></div>
</div>
</section>
<section id="emitting-output-rows">
<h2>Emitting Output Rows<a class="headerlink" href="#emitting-output-rows" title="Permalink to this headline">#</a></h2>
<p>The <cite>eval</cite> and <cite>terminate</cite> methods then emit zero or more output rows conforming to this schema by
yielding tuples, lists, or <code class="docutils literal notranslate"><span class="pre">pyspark.sql.Row</span></code> objects.</p>
<p>For example, here we return a row by providing a tuple of three elements:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">eval</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">z</span><span class="p">):</span>
<span class="k">yield</span> <span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">z</span><span class="p">)</span>
</pre></div>
</div>
<p>It is also acceptable to omit the parentheses:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">eval</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">z</span><span class="p">):</span>
<span class="k">yield</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">z</span>
</pre></div>
</div>
<p>Remember to add a trailing comma if returning a row with only one column!</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">eval</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">z</span><span class="p">):</span>
<span class="k">yield</span> <span class="n">x</span><span class="p">,</span>
</pre></div>
</div>
<p>It is also possible to yield a <code class="docutils literal notranslate"><span class="pre">pyspark.sql.Row</span></code> object.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">eval</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">z</span><span class="p">):</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="n">Row</span>
<span class="k">yield</span> <span class="n">Row</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">z</span><span class="p">)</span>
</pre></div>
</div>
<p>This is an example of yielding output rows from the <cite>terminate</cite> method using a Python list.
Usually it makes sense to store state inside the class for this purpose from earlier steps in the
UDTF evaluation.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">terminate</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">yield</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">x</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">y</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">z</span><span class="p">]</span>
</pre></div>
</div>
</section>
<section id="registering-and-using-python-udtfs-in-sql">
<h2>Registering and Using Python UDTFs in SQL<a class="headerlink" href="#registering-and-using-python-udtfs-in-sql" title="Permalink to this headline">#</a></h2>
<p>Python UDTFs can be registered and used in SQL queries.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">udtf</span>
<span class="nd">@udtf</span><span class="p">(</span><span class="n">returnType</span><span class="o">=</span><span class="s2">&quot;word: string&quot;</span><span class="p">)</span>
<span class="k">class</span> <span class="nc">WordSplitter</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">eval</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">text</span><span class="p">:</span> <span class="nb">str</span><span class="p">):</span>
<span class="k">for</span> <span class="n">word</span> <span class="ow">in</span> <span class="n">text</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s2">&quot; &quot;</span><span class="p">):</span>
<span class="k">yield</span> <span class="p">(</span><span class="n">word</span><span class="o">.</span><span class="n">strip</span><span class="p">(),)</span>
<span class="c1"># Register the UDTF for use in Spark SQL.</span>
<span class="n">spark</span><span class="o">.</span><span class="n">udtf</span><span class="o">.</span><span class="n">register</span><span class="p">(</span><span class="s2">&quot;split_words&quot;</span><span class="p">,</span> <span class="n">WordSplitter</span><span class="p">)</span>
<span class="c1"># Example: Using the UDTF in SQL.</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;SELECT * FROM split_words(&#39;hello world&#39;)&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +-----+</span>
<span class="c1"># | word|</span>
<span class="c1"># +-----+</span>
<span class="c1"># |hello|</span>
<span class="c1"># |world|</span>
<span class="c1"># +-----+</span>
<span class="c1"># Example: Using the UDTF with a lateral join in SQL.</span>
<span class="c1"># The lateral join allows us to reference the columns and aliases</span>
<span class="c1"># in the previous FROM clause items as inputs to the UDTF.</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span>
<span class="s2">&quot;SELECT * FROM VALUES (&#39;Hello World&#39;), (&#39;Apache Spark&#39;) t(text), &quot;</span>
<span class="s2">&quot;LATERAL split_words(text)&quot;</span>
<span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +------------+------+</span>
<span class="c1"># | text| word|</span>
<span class="c1"># +------------+------+</span>
<span class="c1"># | Hello World| Hello|</span>
<span class="c1"># | Hello World| World|</span>
<span class="c1"># |Apache Spark|Apache|</span>
<span class="c1"># |Apache Spark| Spark|</span>
<span class="c1"># +------------+------+</span>
</pre></div>
</div>
</section>
<section id="arrow-optimization">
<h2>Arrow Optimization<a class="headerlink" href="#arrow-optimization" title="Permalink to this headline">#</a></h2>
<p>Apache Arrow is an in-memory columnar data format used in Spark to efficiently transfer
data between Java and Python processes. Apache Arrow is disabled by default for Python UDTFs.</p>
<p>Arrow can improve performance when each input row generates a large result table from the UDTF.</p>
<p>To enable Arrow optimization, set the <code class="docutils literal notranslate"><span class="pre">spark.sql.execution.pythonUDTF.arrow.enabled</span></code>
configuration to <code class="docutils literal notranslate"><span class="pre">true</span></code>. You can also enable it by specifying the <code class="docutils literal notranslate"><span class="pre">useArrow</span></code> parameter
when declaring the UDTF.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">udtf</span>
<span class="nd">@udtf</span><span class="p">(</span><span class="n">returnType</span><span class="o">=</span><span class="s2">&quot;c1: int, c2: int&quot;</span><span class="p">,</span> <span class="n">useArrow</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="k">class</span> <span class="nc">PlusOne</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">eval</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">yield</span> <span class="n">x</span><span class="p">,</span> <span class="n">x</span> <span class="o">+</span> <span class="mi">1</span>
</pre></div>
</div>
<p>For more details, please see <a class="reference internal" href="../arrow_pandas.html"><span class="doc">Apache Arrow in PySpark</span></a>.</p>
</section>
<section id="udtf-examples-with-scalar-arguments">
<h2>UDTF Examples with Scalar Arguments<a class="headerlink" href="#udtf-examples-with-scalar-arguments" title="Permalink to this headline">#</a></h2>
<p>Here is a simple example of a UDTF class implementation:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Define the UDTF class and implement the required `eval` method.</span>
<span class="k">class</span> <span class="nc">SquareNumbers</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">eval</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">start</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">end</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">for</span> <span class="n">num</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">start</span><span class="p">,</span> <span class="n">end</span> <span class="o">+</span> <span class="mi">1</span><span class="p">):</span>
<span class="k">yield</span> <span class="p">(</span><span class="n">num</span><span class="p">,</span> <span class="n">num</span> <span class="o">*</span> <span class="n">num</span><span class="p">)</span>
</pre></div>
</div>
<p>To make use of the UDTF, you’ll first need to instantiate it using the <code class="docutils literal notranslate"><span class="pre">&#64;udtf</span></code> decorator:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">lit</span><span class="p">,</span> <span class="n">udtf</span>
<span class="c1"># Create a UDTF using the class definition and the `udtf` function.</span>
<span class="n">square_num</span> <span class="o">=</span> <span class="n">udtf</span><span class="p">(</span><span class="n">SquareNumbers</span><span class="p">,</span> <span class="n">returnType</span><span class="o">=</span><span class="s2">&quot;num: int, squared: int&quot;</span><span class="p">)</span>
<span class="c1"># Invoke the UDTF in PySpark.</span>
<span class="n">square_num</span><span class="p">(</span><span class="n">lit</span><span class="p">(</span><span class="mi">1</span><span class="p">),</span> <span class="n">lit</span><span class="p">(</span><span class="mi">3</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +---+-------+</span>
<span class="c1"># |num|squared|</span>
<span class="c1"># +---+-------+</span>
<span class="c1"># | 1| 1|</span>
<span class="c1"># | 2| 4|</span>
<span class="c1"># | 3| 9|</span>
<span class="c1"># +---+-------+</span>
</pre></div>
</div>
<p>An alternative way to create a UDTF is to use the <a class="reference internal" href="../../reference/pyspark.sql/api/pyspark.sql.functions.udtf.html#pyspark.sql.functions.udtf" title="pyspark.sql.functions.udtf"><code class="xref py py-func docutils literal notranslate"><span class="pre">udtf()</span></code></a> function:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">lit</span><span class="p">,</span> <span class="n">udtf</span>
<span class="c1"># Define a UDTF using the `udtf` decorator directly on the class.</span>
<span class="nd">@udtf</span><span class="p">(</span><span class="n">returnType</span><span class="o">=</span><span class="s2">&quot;num: int, squared: int&quot;</span><span class="p">)</span>
<span class="k">class</span> <span class="nc">SquareNumbers</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">eval</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">start</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">end</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">for</span> <span class="n">num</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">start</span><span class="p">,</span> <span class="n">end</span> <span class="o">+</span> <span class="mi">1</span><span class="p">):</span>
<span class="k">yield</span> <span class="p">(</span><span class="n">num</span><span class="p">,</span> <span class="n">num</span> <span class="o">*</span> <span class="n">num</span><span class="p">)</span>
<span class="c1"># Invoke the UDTF in PySpark using the SquareNumbers class directly.</span>
<span class="n">SquareNumbers</span><span class="p">(</span><span class="n">lit</span><span class="p">(</span><span class="mi">1</span><span class="p">),</span> <span class="n">lit</span><span class="p">(</span><span class="mi">3</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +---+-------+</span>
<span class="c1"># |num|squared|</span>
<span class="c1"># +---+-------+</span>
<span class="c1"># | 1| 1|</span>
<span class="c1"># | 2| 4|</span>
<span class="c1"># | 3| 9|</span>
<span class="c1"># +---+-------+</span>
</pre></div>
</div>
<p>Here is a Python UDTF that expands date ranges into individual dates:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">datetime</span><span class="p">,</span> <span class="n">timedelta</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">lit</span><span class="p">,</span> <span class="n">udtf</span>
<span class="nd">@udtf</span><span class="p">(</span><span class="n">returnType</span><span class="o">=</span><span class="s2">&quot;date: string&quot;</span><span class="p">)</span>
<span class="k">class</span> <span class="nc">DateExpander</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">eval</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">start_date</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">end_date</span><span class="p">:</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">current</span> <span class="o">=</span> <span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">start_date</span><span class="p">,</span> <span class="s1">&#39;%Y-%m-</span><span class="si">%d</span><span class="s1">&#39;</span><span class="p">)</span>
<span class="n">end</span> <span class="o">=</span> <span class="n">datetime</span><span class="o">.</span><span class="n">strptime</span><span class="p">(</span><span class="n">end_date</span><span class="p">,</span> <span class="s1">&#39;%Y-%m-</span><span class="si">%d</span><span class="s1">&#39;</span><span class="p">)</span>
<span class="k">while</span> <span class="n">current</span> <span class="o">&lt;=</span> <span class="n">end</span><span class="p">:</span>
<span class="k">yield</span> <span class="p">(</span><span class="n">current</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s1">&#39;%Y-%m-</span><span class="si">%d</span><span class="s1">&#39;</span><span class="p">),)</span>
<span class="n">current</span> <span class="o">+=</span> <span class="n">timedelta</span><span class="p">(</span><span class="n">days</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="n">DateExpander</span><span class="p">(</span><span class="n">lit</span><span class="p">(</span><span class="s2">&quot;2023-02-25&quot;</span><span class="p">),</span> <span class="n">lit</span><span class="p">(</span><span class="s2">&quot;2023-03-01&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +----------+</span>
<span class="c1"># | date|</span>
<span class="c1"># +----------+</span>
<span class="c1"># |2023-02-25|</span>
<span class="c1"># |2023-02-26|</span>
<span class="c1"># |2023-02-27|</span>
<span class="c1"># |2023-02-28|</span>
<span class="c1"># |2023-03-01|</span>
<span class="c1"># +----------+</span>
</pre></div>
</div>
<p>Here is a Python UDTF with <code class="docutils literal notranslate"><span class="pre">__init__</span></code> and <code class="docutils literal notranslate"><span class="pre">terminate</span></code>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">udtf</span>
<span class="nd">@udtf</span><span class="p">(</span><span class="n">returnType</span><span class="o">=</span><span class="s2">&quot;cnt: int&quot;</span><span class="p">)</span>
<span class="k">class</span> <span class="nc">CountUDTF</span><span class="p">:</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="c1"># Initialize the counter to 0 when an instance of the class is created.</span>
<span class="bp">self</span><span class="o">.</span><span class="n">count</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">def</span> <span class="nf">eval</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
<span class="c1"># Increment the counter by 1 for each input value received.</span>
<span class="bp">self</span><span class="o">.</span><span class="n">count</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="k">def</span> <span class="nf">terminate</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="c1"># Yield the final count when the UDTF is done processing.</span>
<span class="k">yield</span> <span class="bp">self</span><span class="o">.</span><span class="n">count</span><span class="p">,</span>
<span class="n">spark</span><span class="o">.</span><span class="n">udtf</span><span class="o">.</span><span class="n">register</span><span class="p">(</span><span class="s2">&quot;count_udtf&quot;</span><span class="p">,</span> <span class="n">CountUDTF</span><span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;SELECT * FROM range(0, 10, 1, 1), LATERAL count_udtf(id)&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +---+---+</span>
<span class="c1"># | id|cnt|</span>
<span class="c1"># +---+---+</span>
<span class="c1"># | 9| 10|</span>
<span class="c1"># +---+---+</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;SELECT * FROM range(0, 10, 1, 2), LATERAL count_udtf(id)&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +---+---+</span>
<span class="c1"># | id|cnt|</span>
<span class="c1"># +---+---+</span>
<span class="c1"># | 4| 5|</span>
<span class="c1"># | 9| 5|</span>
<span class="c1"># +---+---+</span>
</pre></div>
</div>
</section>
<section id="accepting-an-input-table-argument">
<h2>Accepting an Input Table Argument<a class="headerlink" href="#accepting-an-input-table-argument" title="Permalink to this headline">#</a></h2>
<p>The UDTF examples above show functions that accept scalar input arguments, such as integers or
strings.</p>
<p>However, any Python UDTF can also accept an input table as an argument, and this can work in
conjunction with scalar input argument(s) for the same function definition. You are allowed to
have only one such table argument as input.</p>
<p>Then any SQL query can provide an input table using the <code class="docutils literal notranslate"><span class="pre">TABLE</span></code> keyword followed by parentheses
surrounding an appropriate table identifier, like <code class="docutils literal notranslate"><span class="pre">TABLE(t)</span></code>. Alternatively, you can pass a table
subquery, like <code class="docutils literal notranslate"><span class="pre">TABLE(SELECT</span> <span class="pre">a,</span> <span class="pre">b,</span> <span class="pre">c</span> <span class="pre">FROM</span> <span class="pre">t)</span></code> or
<code class="docutils literal notranslate"><span class="pre">TABLE(SELECT</span> <span class="pre">t1.a,</span> <span class="pre">t2.b</span> <span class="pre">FROM</span> <span class="pre">t1</span> <span class="pre">INNER</span> <span class="pre">JOIN</span> <span class="pre">t2</span> <span class="pre">USING</span> <span class="pre">(key))</span></code>.</p>
<p>The input table argument is then represented as a <code class="docutils literal notranslate"><span class="pre">pyspark.sql.Row</span></code> argument to the <code class="docutils literal notranslate"><span class="pre">eval</span></code>
method, with one call to the <code class="docutils literal notranslate"><span class="pre">eval</span></code> method for each row in the input table.</p>
<p>For example:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">udtf</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="n">Row</span>
<span class="nd">@udtf</span><span class="p">(</span><span class="n">returnType</span><span class="o">=</span><span class="s2">&quot;id: int&quot;</span><span class="p">)</span>
<span class="k">class</span> <span class="nc">FilterUDTF</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">eval</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">row</span><span class="p">:</span> <span class="n">Row</span><span class="p">):</span>
<span class="k">if</span> <span class="n">row</span><span class="p">[</span><span class="s2">&quot;id&quot;</span><span class="p">]</span> <span class="o">&gt;</span> <span class="mi">5</span><span class="p">:</span>
<span class="k">yield</span> <span class="n">row</span><span class="p">[</span><span class="s2">&quot;id&quot;</span><span class="p">],</span>
<span class="n">spark</span><span class="o">.</span><span class="n">udtf</span><span class="o">.</span><span class="n">register</span><span class="p">(</span><span class="s2">&quot;filter_udtf&quot;</span><span class="p">,</span> <span class="n">FilterUDTF</span><span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;SELECT * FROM filter_udtf(TABLE(SELECT * FROM range(10)))&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +---+</span>
<span class="c1"># | id|</span>
<span class="c1"># +---+</span>
<span class="c1"># | 6|</span>
<span class="c1"># | 7|</span>
<span class="c1"># | 8|</span>
<span class="c1"># | 9|</span>
<span class="c1"># +---+</span>
</pre></div>
</div>
<p>When calling a UDTF with a table argument, any SQL query can request that the input table be
partitioned across several UDTF calls based on the values of one or more columns of the input
table. To do so, specify the <code class="docutils literal notranslate"><span class="pre">PARTITION</span> <span class="pre">BY</span></code> clause in the function call after the <code class="docutils literal notranslate"><span class="pre">TABLE</span></code>
argument. This provides a guarantee that all input rows with each unique combination of values of
the partitioning columns will get consumed by exactly one instance of the UDTF class.</p>
<p>Note that in addition to simple column references, the <code class="docutils literal notranslate"><span class="pre">PARTITION</span> <span class="pre">BY</span></code> clause also accepts
arbitrary expressions based on columns of the input table. For example, you can specify the
<code class="docutils literal notranslate"><span class="pre">LENGTH</span></code> of a string, extract a month from a date, or concatenate two values.</p>
<p>It is also possible to specify <code class="docutils literal notranslate"><span class="pre">WITH</span> <span class="pre">SINGLE</span> <span class="pre">PARTITION</span></code> instead of <code class="docutils literal notranslate"><span class="pre">PARTITION</span> <span class="pre">BY</span></code> to request
only one partition wherein all input rows must be consumed by exactly one instance of the UDTF
class.</p>
<p>Within each partition, you can optionally specify a required ordering of the input rows as the
UDTF’s <code class="docutils literal notranslate"><span class="pre">eval</span></code> method consumes them. To do so, provide an <code class="docutils literal notranslate"><span class="pre">ORDER</span> <span class="pre">BY</span></code> clause after the
<code class="docutils literal notranslate"><span class="pre">PARTITION</span> <span class="pre">BY</span></code> or <code class="docutils literal notranslate"><span class="pre">WITH</span> <span class="pre">SINGLE</span> <span class="pre">PARTITION</span></code> clause described above.</p>
<p>For example:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">udtf</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="n">Row</span>
<span class="c1"># Define and register a UDTF.</span>
<span class="nd">@udtf</span><span class="p">(</span><span class="n">returnType</span><span class="o">=</span><span class="s2">&quot;a: string, b: int&quot;</span><span class="p">)</span>
<span class="k">class</span> <span class="nc">FilterUDTF</span><span class="p">:</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">key</span> <span class="o">=</span> <span class="s2">&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">max</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">def</span> <span class="nf">eval</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">row</span><span class="p">:</span> <span class="n">Row</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">key</span> <span class="o">=</span> <span class="n">row</span><span class="p">[</span><span class="s2">&quot;a&quot;</span><span class="p">]</span>
<span class="bp">self</span><span class="o">.</span><span class="n">max</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">max</span><span class="p">,</span> <span class="n">row</span><span class="p">[</span><span class="s2">&quot;b&quot;</span><span class="p">])</span>
<span class="k">def</span> <span class="nf">terminate</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">yield</span> <span class="bp">self</span><span class="o">.</span><span class="n">key</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">max</span>
<span class="n">spark</span><span class="o">.</span><span class="n">udtf</span><span class="o">.</span><span class="n">register</span><span class="p">(</span><span class="s2">&quot;filter_udtf&quot;</span><span class="p">,</span> <span class="n">FilterUDTF</span><span class="p">)</span>
<span class="c1"># Create an input table with some example values.</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;DROP TABLE IF EXISTS values_table&quot;</span><span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;CREATE TABLE values_table (a STRING, b INT)&quot;</span><span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;INSERT INTO values_table VALUES (&#39;abc&#39;, 2), (&#39;abc&#39;, 4), (&#39;def&#39;, 6), (&#39;def&#39;, 8)&quot;</span><span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">table</span><span class="p">(</span><span class="s2">&quot;values_table&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +-------+----+</span>
<span class="c1"># | a | b |</span>
<span class="c1"># +-------+----+</span>
<span class="c1"># | &quot;abc&quot; | 2 |</span>
<span class="c1"># | &quot;abc&quot; | 4 |</span>
<span class="c1"># | &quot;def&quot; | 6 |</span>
<span class="c1"># | &quot;def&quot; | 8 |</span>
<span class="c1"># +-------+----+</span>
<span class="c1"># Query the UDTF with the input table as an argument, and a directive to partition the input</span>
<span class="c1"># rows such that all rows with each unique value of the `a` column are processed by the same</span>
<span class="c1"># instance of the UDTF class. Within each partition, the rows are ordered by the `b` column.</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;&quot;&quot;</span>
<span class="s2"> SELECT * FROM filter_udtf(TABLE(values_table) PARTITION BY a ORDER BY b) ORDER BY 1</span>
<span class="s2"> &quot;&quot;&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +-------+----+</span>
<span class="c1"># | a | b |</span>
<span class="c1"># +-------+----+</span>
<span class="c1"># | &quot;abc&quot; | 4 |</span>
<span class="c1"># | &quot;def&quot; | 8 |</span>
<span class="c1"># +-------+----+</span>
<span class="c1"># Query the UDTF with the input table as an argument, and a directive to partition the input</span>
<span class="c1"># rows such that all rows with each unique result of evaluating the &quot;LENGTH(a)&quot; expression are</span>
<span class="c1"># processed by the same instance of the UDTF class. Within each partition, the rows are ordered</span>
<span class="c1"># by the `b` column.</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;&quot;&quot;</span>
<span class="s2"> SELECT * FROM filter_udtf(TABLE(values_table) PARTITION BY LENGTH(a) ORDER BY b) ORDER BY 1</span>
<span class="s2"> &quot;&quot;&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +-------+---+</span>
<span class="c1"># | a | b |</span>
<span class="c1"># +-------+---+</span>
<span class="c1"># | &quot;def&quot; | 8 |</span>
<span class="c1"># +-------+---+</span>
<span class="c1"># Query the UDTF with the input table as an argument, and a directive to consider all the input</span>
<span class="c1"># rows in one single partition such that exactly one instance of the UDTF class consumes all of</span>
<span class="c1"># the input rows. Within each partition, the rows are ordered by the `b` column.</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;&quot;&quot;</span>
<span class="s2"> SELECT * FROM filter_udtf(TABLE(values_table) WITH SINGLE PARTITION ORDER BY b) ORDER BY 1</span>
<span class="s2"> &quot;&quot;&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="c1"># +-------+----+</span>
<span class="c1"># | a | b |</span>
<span class="c1"># +-------+----+</span>
<span class="c1"># | &quot;def&quot; | 8 |</span>
<span class="c1"># +-------+----+</span>
<span class="c1"># Clean up.</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;DROP TABLE values_table&quot;</span><span class="p">)</span>
</pre></div>
</div>
<p>Note that for each of these ways of partitioning the input table when calling UDTFs in SQL
queries, there is a corresponding way for the UDTF’s <code class="docutils literal notranslate"><span class="pre">analyze</span></code> method to specify the same
partitioning method automatically instead.</p>
<p>For example, instead of calling a UDTF as <code class="docutils literal notranslate"><span class="pre">SELECT</span> <span class="pre">*</span> <span class="pre">FROM</span> <span class="pre">udtf(TABLE(t)</span> <span class="pre">PARTITION</span> <span class="pre">BY</span> <span class="pre">a)</span></code>, you can
update the <code class="docutils literal notranslate"><span class="pre">analyze</span></code> method to set the field <code class="docutils literal notranslate"><span class="pre">partitionBy=[PartitioningColumn(&quot;a&quot;)]</span></code> and simply
call the function like <code class="docutils literal notranslate"><span class="pre">SELECT</span> <span class="pre">*</span> <span class="pre">FROM</span> <span class="pre">udtf(TABLE(t))</span></code>.</p>
<p>By the same token, instead of specifying <code class="docutils literal notranslate"><span class="pre">TABLE(t)</span> <span class="pre">WITH</span> <span class="pre">SINGLE</span> <span class="pre">PARTITION</span></code> in the SQL query,
make <code class="docutils literal notranslate"><span class="pre">analyze</span></code> set the field <code class="docutils literal notranslate"><span class="pre">withSinglePartition=True</span></code> and then just pass <code class="docutils literal notranslate"><span class="pre">TABLE(t)</span></code>.</p>
<p>Instead of passing <code class="docutils literal notranslate"><span class="pre">TABLE(t)</span> <span class="pre">ORDER</span> <span class="pre">BY</span> <span class="pre">b</span></code> in the SQL query, you can make <code class="docutils literal notranslate"><span class="pre">analyze</span></code> set
<code class="docutils literal notranslate"><span class="pre">orderBy=[OrderingColumn(&quot;b&quot;)]</span></code> and then just pass <code class="docutils literal notranslate"><span class="pre">TABLE(t)</span></code>.</p>
<p>Instead of passing <code class="docutils literal notranslate"><span class="pre">TABLE(SELECT</span> <span class="pre">a</span> <span class="pre">FROM</span> <span class="pre">t)</span></code> in the SQL query, you can make <code class="docutils literal notranslate"><span class="pre">analyze</span></code> set
<code class="docutils literal notranslate"><span class="pre">select=[SelectedColumn(&quot;a&quot;)]</span></code> and then just pass <code class="docutils literal notranslate"><span class="pre">TABLE(t)</span></code>.</p>
</section>
</section>
</article>
<footer class="bd-footer-article">
<div class="footer-article-items footer-article__inner">
<div class="footer-article-item"><!-- Previous / next buttons -->
<div class="prev-next-area">
<a class="left-prev"
href="arrow_pandas.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">Apache Arrow in PySpark</p>
</div>
</a>
<a class="right-next"
href="python_data_source.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">Python Data Source API</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div></div>
</div>
</footer>
</div>
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> On this page
</div>
<nav class="bd-toc-nav page-toc">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementing-a-python-udtf">Implementing a Python UDTF</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#defining-the-output-schema">Defining the Output Schema</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#emitting-output-rows">Emitting Output Rows</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#registering-and-using-python-udtfs-in-sql">Registering and Using Python UDTFs in SQL</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#arrow-optimization">Arrow Optimization</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#udtf-examples-with-scalar-arguments">UDTF Examples with Scalar Arguments</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#accepting-an-input-table-argument">Accepting an Input Table Argument</a></li>
</ul>
</nav></div>
<div class="sidebar-secondary-item">
<div class="tocsection sourcelink">
<a href="../../_sources/user_guide/sql/python_udtf.rst.txt">
<i class="fa-solid fa-file-lines"></i> Show Source
</a>
</div>
</div>
</div></div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script>
<script src="../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script>
<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
<div class="footer-items__start">
<div class="footer-item"><p class="copyright">
Copyright © 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
</p></div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0.
<br/>
</p>
</div>
</div>
<div class="footer-items__end">
<div class="footer-item"><p class="theme-version">
Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3.
</p></div>
</div>
</div>
</footer>
</body>
</html>