<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<title>pyspark.sql.functions.pandas_udf &#8212; PySpark 3.2.2 documentation</title>
<link rel="stylesheet" href="../../_static/css/index.73d71520a4ca3b99cfee5594769eaaae.css">
<link rel="stylesheet"
href="../../_static/vendor/fontawesome/5.13.0/css/all.min.css">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2">
<link rel="stylesheet"
href="../../_static/vendor/open-sans_all/1.44.1/index.css">
<link rel="stylesheet"
href="../../_static/vendor/lato_latin-ext/1.44.1/index.css">
<link rel="stylesheet" href="../../_static/basic.css" type="text/css" />
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" type="text/css" href="../../_static/css/pyspark.css" />
<link rel="preload" as="script" href="../../_static/js/index.3da636dd464baa7582d2.js">
<script id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
<script src="../../_static/jquery.js"></script>
<script src="../../_static/underscore.js"></script>
<script src="../../_static/doctools.js"></script>
<script src="../../_static/language_data.js"></script>
<script src="../../_static/copybutton.js"></script>
<script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
<script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "document", "processClass": "math|output_area"}})</script>
<link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.pandas_udf.html" />
<link rel="search" title="Search" href="../../search.html" />
<link rel="next" title="pyspark.sql.functions.percent_rank" href="pyspark.sql.functions.percent_rank.html" />
<link rel="prev" title="pyspark.sql.functions.overlay" href="pyspark.sql.functions.overlay.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="en" />
</head>
<body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80">
<nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main">
<div class="container-xl">
<a class="navbar-brand" href="../../index.html">
<img src="../../_static/spark-logo-reverse.png" class="logo" alt="logo" />
</a>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-menu" aria-controls="navbar-menu" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div id="navbar-menu" class="col-lg-9 collapse navbar-collapse">
<ul id="navbar-main-elements" class="navbar-nav mr-auto">
<li class="nav-item ">
<a class="nav-link" href="../../getting_started/index.html">Getting Started</a>
</li>
<li class="nav-item ">
<a class="nav-link" href="../../user_guide/index.html">User Guide</a>
</li>
<li class="nav-item active">
<a class="nav-link" href="../index.html">API Reference</a>
</li>
<li class="nav-item ">
<a class="nav-link" href="../../development/index.html">Development</a>
</li>
<li class="nav-item ">
<a class="nav-link" href="../../migration_guide/index.html">Migration Guide</a>
</li>
</ul>
<ul class="navbar-nav">
</ul>
</div>
</div>
</nav>
<div class="container-xl">
<div class="row">
<div class="col-12 col-md-3 bd-sidebar"><form class="bd-search d-flex align-items-center" action="../../search.html" method="get">
<i class="icon fas fa-search"></i>
<input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" >
</form>
<nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
<div class="bd-toc-item active">
<ul class="nav bd-sidenav">
<li class="active">
<a href="../pyspark.sql.html">Spark SQL</a>
</li>
<li class="">
<a href="../pyspark.pandas/index.html">Pandas API on Spark</a>
</li>
<li class="">
<a href="../pyspark.ss.html">Structured Streaming</a>
</li>
<li class="">
<a href="../pyspark.ml.html">MLlib (DataFrame-based)</a>
</li>
<li class="">
<a href="../pyspark.streaming.html">Spark Streaming</a>
</li>
<li class="">
<a href="../pyspark.mllib.html">MLlib (RDD-based)</a>
</li>
<li class="">
<a href="../pyspark.html">Spark Core</a>
</li>
<li class="">
<a href="../pyspark.resource.html">Resource Management</a>
</li>
</ul>
</nav>
</div>
<div class="d-none d-xl-block col-xl-2 bd-toc">
<nav id="bd-toc-nav">
<ul class="nav section-nav flex-column">
</ul>
</nav>
</div>
<main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main">
<div>
<div class="section" id="pyspark-sql-functions-pandas-udf">
<h1>pyspark.sql.functions.pandas_udf<a class="headerlink" href="#pyspark-sql-functions-pandas-udf" title="Permalink to this headline">¶</a></h1>
<dl class="py function">
<dt id="pyspark.sql.functions.pandas_udf">
<code class="sig-prename descclassname">pyspark.sql.functions.</code><code class="sig-name descname">pandas_udf</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">f</span><span class="o">=</span><span class="default_value">None</span></em>, <em class="sig-param"><span class="n">returnType</span><span class="o">=</span><span class="default_value">None</span></em>, <em class="sig-param"><span class="n">functionType</span><span class="o">=</span><span class="default_value">None</span></em><span class="sig-paren">)</span><a class="reference internal" href="../../_modules/pyspark/sql/pandas/functions.html#pandas_udf"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#pyspark.sql.functions.pandas_udf" title="Permalink to this definition">¶</a></dt>
<dd><p>Creates a pandas user defined function (a.k.a. vectorized user defined function).</p>
<p>Pandas UDFs are user-defined functions that are executed by Spark using Arrow to transfer
data and pandas to work with the data, which allows vectorized operations. A Pandas UDF
is defined using <cite>pandas_udf</cite> either as a decorator or to wrap the function, and no
additional configuration is required. A Pandas UDF generally behaves like a regular PySpark
function API.</p>
<div class="versionadded">
<p><span class="versionmodified added">New in version 2.3.0.</span></p>
</div>
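<p>For instance, a minimal sketch (illustrative only, not taken from the original examples) of the two
equivalent ways to define a Pandas UDF, as a decorator or by wrapping an existing function:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import pandas as pd
from pyspark.sql.functions import pandas_udf

# As a decorator: the return type is passed to pandas_udf and the
# function type is inferred from the pandas.Series type hints.
@pandas_udf("long")
def slen(s: pd.Series) -&gt; pd.Series:
    return s.str.len()

# Wrapping an existing function (here a lambda) instead of decorating it.
slen = pandas_udf(lambda s: s.str.len(), returnType="long")
</pre></div>
</div>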
<dl class="field-list">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><dl>
<dt><strong>f</strong><span class="classifier">function, optional</span></dt><dd><p>user-defined function. A Python function if used as a standalone function.</p>
</dd>
<dt><strong>returnType</strong><span class="classifier"><a class="reference internal" href="pyspark.sql.types.DataType.html#pyspark.sql.types.DataType" title="pyspark.sql.types.DataType"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyspark.sql.types.DataType</span></code></a> or str, optional</span></dt><dd><p>the return type of the user-defined function. The value can be either a
<a class="reference internal" href="pyspark.sql.types.DataType.html#pyspark.sql.types.DataType" title="pyspark.sql.types.DataType"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyspark.sql.types.DataType</span></code></a> object or a DDL-formatted type string.</p>
</dd>
<dt><strong>functionType</strong><span class="classifier">int, optional</span></dt><dd><p>an enum value in <code class="xref py py-class docutils literal notranslate"><span class="pre">pyspark.sql.functions.PandasUDFType</span></code>.
Default: SCALAR. This parameter exists for compatibility.
Using Python type hints is encouraged.</p>
</dd>
</dl>
</dd>
</dl>
<div class="admonition seealso">
<p class="admonition-title">See also</p>
<dl class="simple">
<dt><a class="reference internal" href="pyspark.sql.GroupedData.agg.html#pyspark.sql.GroupedData.agg" title="pyspark.sql.GroupedData.agg"><code class="xref py py-obj docutils literal notranslate"><span class="pre">pyspark.sql.GroupedData.agg</span></code></a></dt><dd></dd>
<dt><a class="reference internal" href="pyspark.sql.DataFrame.mapInPandas.html#pyspark.sql.DataFrame.mapInPandas" title="pyspark.sql.DataFrame.mapInPandas"><code class="xref py py-obj docutils literal notranslate"><span class="pre">pyspark.sql.DataFrame.mapInPandas</span></code></a></dt><dd></dd>
<dt><a class="reference internal" href="pyspark.sql.GroupedData.applyInPandas.html#pyspark.sql.GroupedData.applyInPandas" title="pyspark.sql.GroupedData.applyInPandas"><code class="xref py py-obj docutils literal notranslate"><span class="pre">pyspark.sql.GroupedData.applyInPandas</span></code></a></dt><dd></dd>
<dt><a class="reference internal" href="pyspark.sql.PandasCogroupedOps.applyInPandas.html#pyspark.sql.PandasCogroupedOps.applyInPandas" title="pyspark.sql.PandasCogroupedOps.applyInPandas"><code class="xref py py-obj docutils literal notranslate"><span class="pre">pyspark.sql.PandasCogroupedOps.applyInPandas</span></code></a></dt><dd></dd>
<dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">pyspark.sql.UDFRegistration.register</span></code></dt><dd></dd>
</dl>
</div>
<p class="rubric">Notes</p>
<p>The user-defined functions do not support conditional expressions or short-circuiting
in boolean expressions; they are evaluated over all rows internally regardless of such conditions.
If a function can fail on special rows, the workaround is to incorporate the condition into the function itself.</p>
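<p>As an illustrative sketch (the function below is hypothetical, not part of the API), the condition is
handled inside the UDF rather than with a conditional expression around the call:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import pandas as pd
from pyspark.sql.functions import pandas_udf

@pandas_udf("double")
def parse_number(s: pd.Series) -&gt; pd.Series:
    # Rows that are not valid numbers would make a naive conversion fail,
    # so the check is incorporated into the function itself instead of
    # relying on a short-circuiting expression on the calling side.
    return pd.to_numeric(s, errors="coerce")
</pre></div>
</div>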
<p>The user-defined functions do not take keyword arguments on the calling side.</p>
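<p>For example, with a UDF such as <cite>to_upper(s)</cite> from the examples below, only positional calls
are supported (a sketch, not additional API behavior):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>df.select(to_upper("name"))      # positional argument: supported
# df.select(to_upper(s="name"))  # keyword argument: not supported
</pre></div>
</div>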
<p>The data type of the <cite>pandas.Series</cite> returned from the user-defined functions should
match the defined <cite>returnType</cite> (see <code class="xref py py-meth docutils literal notranslate"><span class="pre">types.to_arrow_type()</span></code> and
<code class="xref py py-meth docutils literal notranslate"><span class="pre">types.from_arrow_type()</span></code>). When there is a mismatch between them, Spark might perform a
conversion on the returned data. The conversion is not guaranteed to be correct, and users
should check the results for accuracy.</p>
<p>Currently,
<a class="reference internal" href="pyspark.sql.types.ArrayType.html#pyspark.sql.types.ArrayType" title="pyspark.sql.types.ArrayType"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyspark.sql.types.ArrayType</span></code></a> of <a class="reference internal" href="pyspark.sql.types.TimestampType.html#pyspark.sql.types.TimestampType" title="pyspark.sql.types.TimestampType"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyspark.sql.types.TimestampType</span></code></a> and
nested <a class="reference internal" href="pyspark.sql.types.StructType.html#pyspark.sql.types.StructType" title="pyspark.sql.types.StructType"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyspark.sql.types.StructType</span></code></a>
are not supported as output types.</p>
<p class="rubric">Examples</p>
<p>To use this API, the following imports are typically required:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">pandas_udf</span>
</pre></div>
</div>
<p>From Spark 3.0 with Python 3.6+, <a class="reference external" href="https://www.python.org/dev/peps/pep-0484">Python type hints</a>
determine the function type, as below:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="nd">@pandas_udf</span><span class="p">(</span><span class="n">IntegerType</span><span class="p">())</span>
<span class="gp">... </span><span class="k">def</span> <span class="nf">slen</span><span class="p">(</span><span class="n">s</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">return</span> <span class="n">s</span><span class="o">.</span><span class="n">str</span><span class="o">.</span><span class="n">len</span><span class="p">()</span>
</pre></div>
</div>
<p>Prior to Spark 3.0, the pandas UDF used <cite>functionType</cite> to decide the execution type as below:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">PandasUDFType</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="n">IntegerType</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nd">@pandas_udf</span><span class="p">(</span><span class="n">IntegerType</span><span class="p">(),</span> <span class="n">PandasUDFType</span><span class="o">.</span><span class="n">SCALAR</span><span class="p">)</span>
<span class="gp">... </span><span class="k">def</span> <span class="nf">slen</span><span class="p">(</span><span class="n">s</span><span class="p">):</span>
<span class="gp">... </span> <span class="k">return</span> <span class="n">s</span><span class="o">.</span><span class="n">str</span><span class="o">.</span><span class="n">len</span><span class="p">()</span>
</pre></div>
</div>
<p>It is preferred to specify type hints for the pandas UDF instead of specifying the pandas UDF
type via <cite>functionType</cite>, which will be deprecated in future releases.</p>
<p>Note that the type hint should use <cite>pandas.Series</cite> in all cases, with one variant:
<cite>pandas.DataFrame</cite> should be used as the input or output type hint instead when the input
or output column is of <a class="reference internal" href="pyspark.sql.types.StructType.html#pyspark.sql.types.StructType" title="pyspark.sql.types.StructType"><code class="xref py py-class docutils literal notranslate"><span class="pre">pyspark.sql.types.StructType</span></code></a>. The following example shows
a Pandas UDF which takes a long column, a string column and a struct column, and outputs a struct
column. It requires the function to specify the type hints of <cite>pandas.Series</cite> and
<cite>pandas.DataFrame</cite> as below:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="nd">@pandas_udf</span><span class="p">(</span><span class="s2">&quot;col1 string, col2 long&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">def</span> <span class="nf">func</span><span class="p">(</span><span class="n">s1</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">,</span> <span class="n">s2</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">,</span> <span class="n">s3</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="gp">... </span> <span class="n">s3</span><span class="p">[</span><span class="s1">&#39;col2&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">s1</span> <span class="o">+</span> <span class="n">s2</span><span class="o">.</span><span class="n">str</span><span class="o">.</span><span class="n">len</span><span class="p">()</span>
<span class="gp">... </span> <span class="k">return</span> <span class="n">s3</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Create a Spark DataFrame that has three columns including a struct column.</span>
<span class="gp">... </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
<span class="gp">... </span> <span class="p">[[</span><span class="mi">1</span><span class="p">,</span> <span class="s2">&quot;a string&quot;</span><span class="p">,</span> <span class="p">(</span><span class="s2">&quot;a nested string&quot;</span><span class="p">,)]],</span>
<span class="gp">... </span> <span class="s2">&quot;long_col long, string_col string, struct_col struct&lt;col1:string&gt;&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="o">.</span><span class="n">printSchema</span><span class="p">()</span>
<span class="go">root</span>
<span class="go">|-- long_column: long (nullable = true)</span>
<span class="go">|-- string_column: string (nullable = true)</span>
<span class="go">|-- struct_column: struct (nullable = true)</span>
<span class="go">| |-- col1: string (nullable = true)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">func</span><span class="p">(</span><span class="s2">&quot;long_col&quot;</span><span class="p">,</span> <span class="s2">&quot;string_col&quot;</span><span class="p">,</span> <span class="s2">&quot;struct_col&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">printSchema</span><span class="p">()</span>
<span class="go">|-- func(long_col, string_col, struct_col): struct (nullable = true)</span>
<span class="go">| |-- col1: string (nullable = true)</span>
<span class="go">| |-- col2: long (nullable = true)</span>
</pre></div>
</div>
<p>The following sections describe the combinations of supported type hints. For
simplicity, the <cite>pandas.DataFrame</cite> variant is omitted.</p>
<ul>
<li><dl>
<dt>Series to Series</dt><dd><p><cite>pandas.Series</cite>, … -&gt; <cite>pandas.Series</cite></p>
<p>The function takes one or more <cite>pandas.Series</cite> and outputs one <cite>pandas.Series</cite>.
The output of the function should always be of the same length as the input.</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="nd">@pandas_udf</span><span class="p">(</span><span class="s2">&quot;string&quot;</span><span class="p">)</span>
<span class="gp">... </span><span class="k">def</span> <span class="nf">to_upper</span><span class="p">(</span><span class="n">s</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">return</span> <span class="n">s</span><span class="o">.</span><span class="n">str</span><span class="o">.</span><span class="n">upper</span><span class="p">()</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="s2">&quot;John Doe&quot;</span><span class="p">,)],</span> <span class="p">(</span><span class="s2">&quot;name&quot;</span><span class="p">,))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">to_upper</span><span class="p">(</span><span class="s2">&quot;name&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+--------------+</span>
<span class="go">|to_upper(name)|</span>
<span class="go">+--------------+</span>
<span class="go">| JOHN DOE|</span>
<span class="go">+--------------+</span>
</pre></div>
</div>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="nd">@pandas_udf</span><span class="p">(</span><span class="s2">&quot;first string, last string&quot;</span><span class="p">)</span>
<span class="gp">... </span><span class="k">def</span> <span class="nf">split_expand</span><span class="p">(</span><span class="n">s</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">return</span> <span class="n">s</span><span class="o">.</span><span class="n">str</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">expand</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="s2">&quot;John Doe&quot;</span><span class="p">,)],</span> <span class="p">(</span><span class="s2">&quot;name&quot;</span><span class="p">,))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">split_expand</span><span class="p">(</span><span class="s2">&quot;name&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+------------------+</span>
<span class="go">|split_expand(name)|</span>
<span class="go">+------------------+</span>
<span class="go">| [John, Doe]|</span>
<span class="go">+------------------+</span>
</pre></div>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>The length of the input is not that of the whole input column, but is the
length of an internal batch used for each call to the function.</p>
</div>
</dd>
</dl>
</li>
<li><dl>
<dt>Iterator of Series to Iterator of Series</dt><dd><p><cite>Iterator[pandas.Series]</cite> -&gt; <cite>Iterator[pandas.Series]</cite></p>
<p>The function takes an iterator of <cite>pandas.Series</cite> and outputs an iterator of
<cite>pandas.Series</cite>. In this case, the created pandas UDF requires exactly one input
column when it is called as a PySpark column. The length of the entire output from
the function should be the same as the length of the entire input; therefore, the function can
prefetch data from the input iterator as long as the lengths stay the same.</p>
<p>This variant is also useful when the UDF execution
requires initializing some state, although internally it works identically to the
Series to Series case. The pseudocode below illustrates the example.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nd">@pandas_udf</span><span class="p">(</span><span class="s2">&quot;long&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">calculate</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterator</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Iterator</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">]:</span>
<span class="c1"># Do some expensive initialization with a state</span>
<span class="n">state</span> <span class="o">=</span> <span class="n">very_expensive_initialization</span><span class="p">()</span>
<span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">iterator</span><span class="p">:</span>
<span class="c1"># Use that state for whole iterator.</span>
<span class="k">yield</span> <span class="n">calculate_with_state</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">state</span><span class="p">)</span>
<span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">calculate</span><span class="p">(</span><span class="s2">&quot;value&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Iterator</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nd">@pandas_udf</span><span class="p">(</span><span class="s2">&quot;long&quot;</span><span class="p">)</span>
<span class="gp">... </span><span class="k">def</span> <span class="nf">plus_one</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterator</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Iterator</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">]:</span>
<span class="gp">... </span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">iterator</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">yield</span> <span class="n">s</span> <span class="o">+</span> <span class="mi">1</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;v&quot;</span><span class="p">]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">plus_one</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">v</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+-----------+</span>
<span class="go">|plus_one(v)|</span>
<span class="go">+-----------+</span>
<span class="go">| 2|</span>
<span class="go">| 3|</span>
<span class="go">| 4|</span>
<span class="go">+-----------+</span>
</pre></div>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>The length of each series is the length of a batch internally used.</p>
</div>
</dd>
</dl>
</li>
<li><dl>
<dt>Iterator of Multiple Series to Iterator of Series</dt><dd><p><cite>Iterator[Tuple[pandas.Series, …]]</cite> -&gt; <cite>Iterator[pandas.Series]</cite></p>
<p>The function takes an iterator of a tuple of multiple <cite>pandas.Series</cite> and outputs an
iterator of <cite>pandas.Series</cite>. In this case, the created pandas UDF requires
as many input columns as there are series when it is called as a PySpark column.
Otherwise, it has the same characteristics and restrictions as the Iterator of Series
to Iterator of Series case.</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Iterator</span><span class="p">,</span> <span class="n">Tuple</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">struct</span><span class="p">,</span> <span class="n">col</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nd">@pandas_udf</span><span class="p">(</span><span class="s2">&quot;long&quot;</span><span class="p">)</span>
<span class="gp">... </span><span class="k">def</span> <span class="nf">multiply</span><span class="p">(</span><span class="n">iterator</span><span class="p">:</span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="n">Iterator</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">]:</span>
<span class="gp">... </span> <span class="k">for</span> <span class="n">s1</span><span class="p">,</span> <span class="n">df</span> <span class="ow">in</span> <span class="n">iterator</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">yield</span> <span class="n">s1</span> <span class="o">*</span> <span class="n">df</span><span class="o">.</span><span class="n">v</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;v&quot;</span><span class="p">]))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s1">&#39;output&#39;</span><span class="p">,</span> <span class="n">multiply</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;v&quot;</span><span class="p">),</span> <span class="n">struct</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;v&quot;</span><span class="p">))))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+---+------+</span>
<span class="go">| v|output|</span>
<span class="go">+---+------+</span>
<span class="go">| 1| 1|</span>
<span class="go">| 2| 4|</span>
<span class="go">| 3| 9|</span>
<span class="go">+---+------+</span>
</pre></div>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>The length of each series is the length of a batch internally used.</p>
</div>
</dd>
</dl>
</li>
<li><dl>
<dt>Series to Scalar</dt><dd><p><cite>pandas.Series</cite>, … -&gt; <cite>Any</cite></p>
<p>The function takes <cite>pandas.Series</cite> and returns a scalar value. The <cite>returnType</cite>
should be a primitive data type, and the returned scalar can be either a Python primitive
type, e.g., int or float, or a NumPy data type, e.g., numpy.int64 or numpy.float64.
<cite>Any</cite> should ideally be replaced by the specific scalar type accordingly.</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="nd">@pandas_udf</span><span class="p">(</span><span class="s2">&quot;double&quot;</span><span class="p">)</span>
<span class="gp">... </span><span class="k">def</span> <span class="nf">mean_udf</span><span class="p">(</span><span class="n">v</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">return</span> <span class="n">v</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
<span class="gp">... </span> <span class="p">[(</span><span class="mi">1</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mf">10.0</span><span class="p">)],</span> <span class="p">(</span><span class="s2">&quot;id&quot;</span><span class="p">,</span> <span class="s2">&quot;v&quot;</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s2">&quot;id&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="n">mean_udf</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s1">&#39;v&#39;</span><span class="p">]))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+---+-----------+</span>
<span class="go">| id|mean_udf(v)|</span>
<span class="go">+---+-----------+</span>
<span class="go">| 1| 1.5|</span>
<span class="go">| 2| 6.0|</span>
<span class="go">+---+-----------+</span>
</pre></div>
</div>
<p>This UDF can also be used as a window function, as below:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Window</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nd">@pandas_udf</span><span class="p">(</span><span class="s2">&quot;double&quot;</span><span class="p">)</span>
<span class="gp">... </span><span class="k">def</span> <span class="nf">mean_udf</span><span class="p">(</span><span class="n">v</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="gp">... </span> <span class="k">return</span> <span class="n">v</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span>
<span class="gp">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
<span class="gp">... </span> <span class="p">[(</span><span class="mi">1</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mf">10.0</span><span class="p">)],</span> <span class="p">(</span><span class="s2">&quot;id&quot;</span><span class="p">,</span> <span class="s2">&quot;v&quot;</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">w</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="s1">&#39;id&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="s1">&#39;v&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s1">&#39;mean_v&#39;</span><span class="p">,</span> <span class="n">mean_udf</span><span class="p">(</span><span class="s2">&quot;v&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">w</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
<span class="go">+---+----+------+</span>
<span class="go">| id| v|mean_v|</span>
<span class="go">+---+----+------+</span>
<span class="go">| 1| 1.0| 1.0|</span>
<span class="go">| 1| 2.0| 1.5|</span>
<span class="go">| 2| 3.0| 3.0|</span>
<span class="go">| 2| 5.0| 4.0|</span>
<span class="go">| 2|10.0| 7.5|</span>
<span class="go">+---+----+------+</span>
</pre></div>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>For performance reasons, the input series to window functions are not copied.
Therefore, mutating the input series is not allowed and will cause incorrect results.
For the same reason, users should also not rely on the index of the input series.</p>
</div>
</dd>
</dl>
</li>
</ul>
</dd></dl>
</div>
</div>
</main>
</div>
</div>
<script src="../../_static/js/index.3da636dd464baa7582d2.js"></script>
<footer class="footer mt-5 mt-md-0">
<div class="container">
<p>
&copy; Copyright .<br/>
Created using <a href="http://sphinx-doc.org/">Sphinx</a> 3.0.4.<br/>
</p>
</div>
</footer>
</body>
</html>