| |
| |
| <!DOCTYPE html> |
| |
| |
| <html lang="en"> |
| |
| <head> |
| <meta charset="utf-8" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.17.1: http://docutils.sourceforge.net/" /> |
| |
| <title>Chapter 5: Unleashing UDFs & UDTFs — PySpark 4.1.0-preview1 documentation</title> |
| |
| |
| |
| <script data-cfasync="false"> |
| document.documentElement.dataset.mode = localStorage.getItem("mode") || ""; |
| document.documentElement.dataset.theme = localStorage.getItem("theme") || "light"; |
| </script> |
| |
| <!-- Loaded before other Sphinx assets --> |
| <link href="../_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link href="../_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link href="../_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| |
| |
| <link href="../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" /> |
| |
| <link rel="stylesheet" type="text/css" href="../_static/pygments.css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/copybutton.css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/nbsphinx-code-cells.css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/css/pyspark.css" /> |
| |
| <!-- Pre-loaded scripts that we'll load fully later --> |
| <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=e353d410970836974a52" /> |
| <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" /> |
| |
| <script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script> |
| <script src="../_static/jquery.js"></script> |
| <script src="../_static/underscore.js"></script> |
| <script src="../_static/doctools.js"></script> |
| <script src="../_static/clipboard.min.js"></script> |
| <script src="../_static/copybutton.js"></script> |
| <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script> |
| <script>window.MathJax = {"tex": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true}, "options": {"ignoreHtmlClass": "tex2jax_ignore|mathjax_ignore|document", "processHtmlClass": "tex2jax_process|mathjax_process|math|output_area"}}</script> |
| <script defer="defer" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script> |
| <script>DOCUMENTATION_OPTIONS.pagename = 'user_guide/udfandudtf';</script> |
| <script> |
| DOCUMENTATION_OPTIONS.theme_switcher_json_url = 'https://spark.apache.org/static/versions.json'; |
| DOCUMENTATION_OPTIONS.theme_switcher_version_match = '4.1.0-preview1'; |
| </script> |
| <link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/user_guide/udfandudtf.html" /> |
| <link rel="search" title="Search" href="../search.html" /> |
| <link rel="next" title="Chapter 6: Old SQL, New Tricks - Running SQL on PySpark" href="sql.html" /> |
| <link rel="prev" title="Chapter 4: Bug Busting - Debugging PySpark" href="bugbusting.html" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| <meta name="docsearch:language" content="en"> |
| |
| |
| <!-- Matomo --> |
| <script type="text/javascript"> |
| var _paq = window._paq = window._paq || []; |
| /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ |
| _paq.push(["disableCookies"]); |
| _paq.push(['trackPageView']); |
| _paq.push(['enableLinkTracking']); |
| (function() { |
| var u="https://analytics.apache.org/"; |
| _paq.push(['setTrackerUrl', u+'matomo.php']); |
| _paq.push(['setSiteId', '40']); |
| var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; |
| g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); |
| })(); |
| </script> |
| <!-- End Matomo Code --> |
| |
| </head> |
| |
| |
| <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode=""> |
| |
| |
| |
| <a class="skip-link" href="#main-content">Skip to main content</a> |
| |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__primary" |
| id="__primary"/> |
| <label class="overlay overlay-primary" for="__primary"></label> |
| |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__secondary" |
| id="__secondary"/> |
| <label class="overlay overlay-secondary" for="__secondary"></label> |
| |
| <div class="search-button__wrapper"> |
| <div class="search-button__overlay"></div> |
| <div class="search-button__search-container"> |
| <form class="bd-search d-flex align-items-center" |
| action="../search.html" |
| method="get"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| <input type="search" |
| class="form-control" |
| name="q" |
| id="search-input" |
| placeholder="Search the docs ..." |
| aria-label="Search the docs ..." |
| autocomplete="off" |
| autocorrect="off" |
| autocapitalize="off" |
| spellcheck="false"/> |
| <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span> |
| </form></div> |
| </div> |
| |
| <nav class="bd-header navbar navbar-expand-lg bd-navbar"> |
| <div class="bd-header__inner bd-page-width"> |
| <label class="sidebar-toggle primary-toggle" for="__primary"> |
| <span class="fa-solid fa-bars"></span> |
| </label> |
| |
| <div class="navbar-header-items__start"> |
| |
| <div class="navbar-item"> |
| |
| |
| <a class="navbar-brand logo" href="../index.html"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <img src="https://spark.apache.org/images/spark-logo.png" class="logo__image only-light" alt="Logo image"/> |
| <script>document.write(`<img src="https://spark.apache.org/images/spark-logo-rev.svg" class="logo__image only-dark" alt="Logo image"/>`);</script> |
| |
| |
| </a></div> |
| |
| </div> |
| |
| |
| <div class="col-lg-9 navbar-header-items"> |
| |
| <div class="me-auto navbar-header-items__center"> |
| |
| <div class="navbar-item"><nav class="navbar-nav"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="1" |
| aria-label="Site Navigation"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../getting_started/index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../tutorial/index.html"> |
| Tutorials |
| </a> |
| </li> |
| |
| |
| <li class="nav-item current active"> |
| <a class="nav-link nav-internal" href="index.html"> |
| User Guide |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <button class="btn dropdown-toggle nav-item" type="button" data-bs-toggle="dropdown" aria-haspopup="true" aria-expanded="false"> |
| More |
| </button> |
| <ul class="dropdown-menu"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </ul> |
| </li> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| <div class="navbar-header-items__end"> |
| |
| <div class="navbar-item navbar-persistent--container"> |
| |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <div class="version-switcher__container dropdown"> |
| <button type="button" class="version-switcher__button btn btn-sm navbar-btn dropdown-toggle" data-bs-toggle="dropdown"> |
| 4.1.0-preview1 <!-- this text may get changed later by javascript --> |
| <span class="caret"></span> |
| </button> |
| <div class="version-switcher__menu dropdown-menu list-group-flush py-0"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span> |
| <span class="sr-only">GitHub</span></a> |
| </li> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span> |
| <span class="sr-only">PyPI</span></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| |
| <div class="navbar-persistent--mobile"> |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| |
| <label class="sidebar-toggle secondary-toggle" for="__secondary"> |
| <span class="fa-solid fa-outdent"></span> |
| </label> |
| |
| </div> |
| |
| </nav> |
| |
| <div class="bd-container"> |
| <div class="bd-container__inner bd-page-width"> |
| |
| <div class="bd-sidebar-primary bd-sidebar"> |
| |
| |
| |
| <div class="sidebar-header-items sidebar-primary__section"> |
| |
| |
| <div class="sidebar-header-items__center"> |
| |
| <div class="navbar-item"><nav class="navbar-nav"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="1" |
| aria-label="Site Navigation"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../getting_started/index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../tutorial/index.html"> |
| Tutorials |
| </a> |
| </li> |
| |
| |
| <li class="nav-item current active"> |
| <a class="nav-link nav-internal" href="index.html"> |
| User Guide |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <button class="btn dropdown-toggle nav-item" type="button" data-bs-toggle="dropdown" aria-haspopup="true" aria-expanded="false"> |
| More |
| </button> |
| <ul class="dropdown-menu"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </ul> |
| </li> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| |
| <div class="sidebar-header-items__end"> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <div class="version-switcher__container dropdown"> |
| <button type="button" class="version-switcher__button btn btn-sm navbar-btn dropdown-toggle" data-bs-toggle="dropdown"> |
| 4.1.0-preview1 <!-- this text may get changed later by javascript --> |
| <span class="caret"></span> |
| </button> |
| <div class="version-switcher__menu dropdown-menu list-group-flush py-0"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span> |
| <span class="sr-only">GitHub</span></a> |
| </li> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span> |
| <span class="sr-only">PyPI</span></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| <div class="sidebar-primary-items__start sidebar-primary__section"> |
| <div class="sidebar-primary-item"><nav class="bd-docs-nav bd-links" |
| aria-label="Section Navigation"> |
| <p class="bd-links__title" role="heading" aria-level="1">Section Navigation</p> |
| <div class="bd-toc-item navbar-nav"><ul class="current nav bd-sidenav"> |
| <li class="toctree-l1"><a class="reference internal" href="dataframes.html">Chapter 1: DataFrames - A view into your structured data</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="touroftypes.html">Chapter 2: A Tour of PySpark Data Types</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="dataprep.html">Chapter 3: Function Junction - Data manipulation with PySpark</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="bugbusting.html">Chapter 4: Bug Busting - Debugging PySpark</a></li> |
| <li class="toctree-l1 current active"><a class="current reference internal" href="#">Chapter 5: Unleashing UDFs & UDTFs</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="sql.html">Chapter 6: Old SQL, New Tricks - Running SQL on PySpark</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="loadandbehold.html">Chapter 7: Load and Behold - Data loading, storage, file formats</a></li> |
| </ul> |
| </div> |
| </nav></div> |
| </div> |
| |
| |
| <div class="sidebar-primary-items__end sidebar-primary__section"> |
| </div> |
| |
| <div id="rtd-footer-container"></div> |
| |
| |
| </div> |
| |
| <main id="main-content" class="bd-main"> |
| |
| |
| <div class="bd-content"> |
| <div class="bd-article-container"> |
| |
| <div class="bd-header-article"> |
| <div class="header-article-items header-article__inner"> |
| |
| <div class="header-article-items__start"> |
| |
| <div class="header-article-item"> |
| |
| |
| |
| <nav aria-label="Breadcrumbs"> |
| <ul class="bd-breadcrumbs"> |
| |
| <li class="breadcrumb-item breadcrumb-home"> |
| <a href="../index.html" class="nav-link" aria-label="Home"> |
| <i class="fa-solid fa-home"></i> |
| </a> |
| </li> |
| |
| <li class="breadcrumb-item"><a href="index.html" class="nav-link">User Guide</a></li> |
| |
| <li class="breadcrumb-item active" aria-current="page">Chapter 5: Unleashing UDFs & UDTFs</li> |
| </ul> |
| </nav> |
| </div> |
| |
| </div> |
| |
| |
| </div> |
| </div> |
| |
| |
| |
| |
| <div id="searchbox"></div> |
| <article class="bd-article" role="main"> |
| |
| <section id="Chapter-5:-Unleashing-UDFs-&-UDTFs"> |
| <h1>Chapter 5: Unleashing UDFs & UDTFs<a class="headerlink" href="#Chapter-5:-Unleashing-UDFs-&-UDTFs" title="Permalink to this headline">#</a></h1> |
| <p>In large-scale data processing, customization is often necessary to extend the native capabilities of Spark. <em>Python User-Defined Functions (UDFs)</em> and <em>User-Defined Table Functions (UDTFs)</em> offer a way to perform complex transformations and computations using Python, seamlessly integrating them into Spark’s distributed environment.</p> |
| <p>In this section, we’ll explore how to write and use UDFs and UDTFs in Python, leveraging PySpark to perform complex data transformations that go beyond Spark’s built-in functions.</p> |
| <section id="Python-UDFs"> |
| <h2>Python UDFs<a class="headerlink" href="#Python-UDFs" title="Permalink to this headline">#</a></h2> |
| <section id="Categories-of-Python-UDFs"> |
| <h3>Categories of Python UDFs<a class="headerlink" href="#Categories-of-Python-UDFs" title="Permalink to this headline">#</a></h3> |
| <p>There are two main categories of UDFs supported in PySpark: Scalar Python UDFs and Pandas UDFs.</p> |
| <ul class="simple"> |
| <li><p><em>Scalar Python UDFs</em> are user-defined scalar functions that take or return Python objects serialized/deserialized by <a class="reference external" href="https://docs.python.org/3/library/pickle.html">pickle</a> or <a class="reference external" href="https://arrow.apache.org/">Arrow</a> and operate one row at a time.</p></li> |
| <li><p><em>Pandas UDFs</em> (a.k.a. Vectorized UDFs) are UDFs that take/return pandas Series or DataFrame serialized/deserialized by Apache Arrow and operate block by block. Pandas UDFs have some variations categorized by usage, with specific input and output types: Series to Series, Series to Scalar, and Iterator to Iterator.</p></li> |
| </ul> |
| <p>Built on the Pandas UDF implementation, there are also <em>Pandas Function APIs</em>: Map (i.e., <code class="docutils literal notranslate"><span class="pre">mapInPandas</span></code>) and (Co)Grouped Map (i.e., <code class="docutils literal notranslate"><span class="pre">applyInPandas</span></code>), as well as an Arrow Function API - <code class="docutils literal notranslate"><span class="pre">mapInArrow</span></code>.</p> |
| </section> |
| <section id="To-create-a-Scalar-Python-UDF"> |
| <h3>To create a Scalar Python UDF<a class="headerlink" href="#To-create-a-Scalar-Python-UDF" title="Permalink to this headline">#</a></h3> |
| <p>In the code below, we’ve created a simple scalar Python UDF.</p> |
| <div class="nbinput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[6]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.functions</span><span class="w"> </span><span class="kn">import</span> <span class="n">udf</span> |
| |
| <span class="nd">@udf</span><span class="p">(</span><span class="n">returnType</span><span class="o">=</span><span class="s1">'int'</span><span class="p">)</span> |
| <span class="k">def</span><span class="w"> </span><span class="nf">slen</span><span class="p">(</span><span class="n">s</span><span class="p">:</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="k">return</span> <span class="nb">len</span><span class="p">(</span><span class="n">s</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <section id="Arrow-optimization"> |
| <h4>Arrow optimization<a class="headerlink" href="#Arrow-optimization" title="Permalink to this headline">#</a></h4> |
| <p>Scalar Python UDFs rely on <a class="reference external" href="https://pypi.org/project/cloudpickle/">cloudpickle</a> for serialization and deserialization, and encounter performance bottlenecks, particularly when dealing with large data inputs and outputs. We introduce Arrow-optimized Python UDFs to significantly improve performance.</p> |
| <p>At the core of this optimization lies Apache Arrow, a standardized cross-language columnar in-memory data representation. By harnessing Arrow, these UDFs bypass the traditional, slower methods of data (de)serialization, leading to swift data exchange between JVM and Python processes. With Apache Arrow’s rich type system, these optimized UDFs offer a more consistent and standardized way to handle type coercion.</p> |
| <p>We can control whether or not to enable Arrow optimization for individual UDFs by using the <code class="docutils literal notranslate"><span class="pre">useArrow</span></code> boolean parameter of <code class="docutils literal notranslate"><span class="pre">functions.udf</span></code>. An example is shown below:</p> |
| <div class="highlight-py notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.functions</span><span class="w"> </span><span class="kn">import</span> <span class="n">udf</span> |
| |
| <span class="nd">@udf</span><span class="p">(</span><span class="n">returnType</span><span class="o">=</span><span class="s1">'int'</span><span class="p">,</span> <span class="n">useArrow</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> <span class="c1"># An Arrow Python UDF</span> |
| <span class="k">def</span><span class="w"> </span><span class="nf">arrow_slen</span><span class="p">(</span><span class="n">s</span><span class="p">:</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="o">...</span> |
| </pre></div> |
| </div> |
| <p>In addition, we can enable Arrow optimization for all UDFs of an entire SparkSession via a Spark configuration: <code class="docutils literal notranslate"><span class="pre">spark.sql.execution.pythonUDF.arrow.enabled</span></code>, as shown below:</p> |
| <div class="highlight-py notranslate"><div class="highlight"><pre><span></span><span class="n">spark</span><span class="o">.</span><span class="n">conf</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">"spark.sql.execution.pythonUDF.arrow.enabled"</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span> |
| |
| <span class="nd">@udf</span><span class="p">(</span><span class="n">returnType</span><span class="o">=</span><span class="s1">'int'</span><span class="p">)</span> <span class="c1"># An Arrow Python UDF</span> |
| <span class="k">def</span><span class="w"> </span><span class="nf">arrow_slen</span><span class="p">(</span><span class="n">s</span><span class="p">:</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="o">...</span> |
| </pre></div> |
| </div> |
| </section> |
| </section> |
| <section id="To-use-a-Scalar-Python-UDF"> |
| <h3>To use a Scalar Python UDF<a class="headerlink" href="#To-use-a-Scalar-Python-UDF" title="Permalink to this headline">#</a></h3> |
| <p>In Python, we can invoke a UDF directly on column(s), just like a built-in Spark function, as shown below.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[7]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">data</span> <span class="o">=</span> <span class="p">[(</span><span class="s2">"Alice"</span><span class="p">,),</span> <span class="p">(</span><span class="s2">"Bob"</span><span class="p">,),</span> <span class="p">(</span><span class="s2">"Charlie"</span><span class="p">,)]</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="p">[</span><span class="s2">"name"</span><span class="p">])</span> |
| <span class="n">df</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">"name_length"</span><span class="p">,</span> <span class="n">slen</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s2">"name"</span><span class="p">]))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area stderr docutils container"> |
| <div class="highlight"><pre> |
| |
| </pre></div></div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-------+-----------+ |
| | name|name_length| |
| +-------+-----------+ |
| | Alice| 5| |
| | Bob| 3| |
| |Charlie| 7| |
| +-------+-----------+ |
| |
| </pre></div></div> |
| </div> |
| </section> |
| <section id="To-create-a-Pandas-UDF"> |
| <h3>To create a Pandas UDF<a class="headerlink" href="#To-create-a-Pandas-UDF" title="Permalink to this headline">#</a></h3> |
| <p>In the code below, we’ve created a Pandas UDF which takes one <code class="docutils literal notranslate"><span class="pre">pandas.Series</span></code> and outputs one <code class="docutils literal notranslate"><span class="pre">pandas.Series</span></code>.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[8]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">pandas</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">pd</span> |
| <span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.functions</span><span class="w"> </span><span class="kn">import</span> <span class="n">pandas_udf</span> |
| |
| <span class="nd">@pandas_udf</span><span class="p">(</span><span class="s2">"string"</span><span class="p">)</span> |
| <span class="k">def</span><span class="w"> </span><span class="nf">to_upper</span><span class="p">(</span><span class="n">s</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">s</span><span class="o">.</span><span class="n">str</span><span class="o">.</span><span class="n">upper</span><span class="p">()</span> |
| |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([(</span><span class="s2">"John Doe"</span><span class="p">,)],</span> <span class="p">(</span><span class="s2">"name"</span><span class="p">,))</span> |
| <span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">to_upper</span><span class="p">(</span><span class="s2">"name"</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| <br/></pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +--------------+ |
| |to_upper(name)| |
| +--------------+ |
| | JOHN DOE| |
| +--------------+ |
| |
| </pre></div></div> |
| </div> |
| </section> |
| <section id="To-use-a-Pandas-UDF"> |
| <h3>To use a Pandas UDF<a class="headerlink" href="#To-use-a-Pandas-UDF" title="Permalink to this headline">#</a></h3> |
| <p>Similar to a Scalar Python UDF, we can also invoke a Pandas UDF directly on column(s):</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[9]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">data</span> <span class="o">=</span> <span class="p">[(</span><span class="s2">"Alice"</span><span class="p">,),</span> <span class="p">(</span><span class="s2">"Bob"</span><span class="p">,),</span> <span class="p">(</span><span class="s2">"Charlie"</span><span class="p">,)]</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="p">[</span><span class="s2">"name"</span><span class="p">])</span> |
| <span class="n">df</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">"name_length"</span><span class="p">,</span> <span class="n">to_upper</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s2">"name"</span><span class="p">]))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-------+-----------+ |
| | name|name_length| |
| +-------+-----------+ |
| | Alice| ALICE| |
| | Bob| BOB| |
| |Charlie| CHARLIE| |
| +-------+-----------+ |
| |
| </pre></div></div> |
| </div> |
| </section> |
| <section id="More-Examples"> |
| <h3>More Examples<a class="headerlink" href="#More-Examples" title="Permalink to this headline">#</a></h3> |
| <section id="Example-1:-Python-UDF-to-Process-DataFrame-with-String-and-List-Columns"> |
| <h4>Example 1: Python UDF to Process DataFrame with String and List Columns<a class="headerlink" href="#Example-1:-Python-UDF-to-Process-DataFrame-with-String-and-List-Columns" title="Permalink to this headline">#</a></h4> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[10]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.types</span><span class="w"> </span><span class="kn">import</span> <span class="n">ArrayType</span><span class="p">,</span> <span class="n">IntegerType</span><span class="p">,</span> <span class="n">StringType</span> |
| <span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.functions</span><span class="w"> </span><span class="kn">import</span> <span class="n">udf</span> |
| |
| <span class="n">data</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="p">(</span><span class="s2">"Hello World"</span><span class="p">,</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">]),</span> |
| <span class="p">(</span><span class="s2">"PySpark is Fun"</span><span class="p">,</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">]),</span> |
| <span class="p">(</span><span class="s2">"PySpark Rocks"</span><span class="p">,</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">8</span><span class="p">,</span> <span class="mi">9</span><span class="p">])</span> |
| <span class="p">]</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="p">[</span><span class="s2">"text_column"</span><span class="p">,</span> <span class="s2">"list_column"</span><span class="p">])</span> |
| |
| <span class="nd">@udf</span><span class="p">(</span><span class="n">returnType</span><span class="o">=</span><span class="s2">"string"</span><span class="p">)</span> |
| <span class="k">def</span><span class="w"> </span><span class="nf">process_row</span><span class="p">(</span><span class="n">text</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">numbers</span><span class="p">):</span> |
| <span class="n">vowels_count</span> <span class="o">=</span> <span class="nb">sum</span><span class="p">(</span><span class="mi">1</span> <span class="k">for</span> <span class="n">char</span> <span class="ow">in</span> <span class="n">text</span> <span class="k">if</span> <span class="n">char</span> <span class="ow">in</span> <span class="s2">"aeiouAEIOU"</span><span class="p">)</span> |
| <span class="n">doubled</span> <span class="o">=</span> <span class="p">[</span><span class="n">x</span> <span class="o">*</span> <span class="mi">2</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">numbers</span><span class="p">]</span> |
| <span class="k">return</span> <span class="sa">f</span><span class="s2">"Vowels: </span><span class="si">{</span><span class="n">vowels_count</span><span class="si">}</span><span class="s2">, Doubled: </span><span class="si">{</span><span class="n">doubled</span><span class="si">}</span><span class="s2">"</span> |
| |
| <span class="n">df</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">"process_row"</span><span class="p">,</span> <span class="n">process_row</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s2">"text_column"</span><span class="p">],</span> <span class="n">df</span><span class="p">[</span><span class="s2">"list_column"</span><span class="p">]))</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="n">truncate</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +--------------+-----------+--------------------------------+ |
| |text_column |list_column|process_row | |
| +--------------+-----------+--------------------------------+ |
| |Hello World |[1, 2, 3] |Vowels: 3, Doubled: [2, 4, 6] | |
| |PySpark is Fun|[4, 5, 6] |Vowels: 3, Doubled: [8, 10, 12] | |
| |PySpark Rocks |[7, 8, 9] |Vowels: 2, Doubled: [14, 16, 18]| |
| +--------------+-----------+--------------------------------+ |
| |
| </pre></div></div> |
| </div> |
| </section> |
| <section id="Example-2:-Pandas-UDF-for-Statistical-Computations-and-Complex-Transformation"> |
| <h4>Example 2: Pandas UDF for Statistical Computations and Complex Transformation<a class="headerlink" href="#Example-2:-Pandas-UDF-for-Statistical-Computations-and-Complex-Transformation" title="Permalink to this headline">#</a></h4> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[11]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql</span><span class="w"> </span><span class="kn">import</span> <span class="n">SparkSession</span> |
| <span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.functions</span><span class="w"> </span><span class="kn">import</span> <span class="n">pandas_udf</span> |
| <span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.types</span><span class="w"> </span><span class="kn">import</span> <span class="n">StructType</span><span class="p">,</span> <span class="n">StructField</span><span class="p">,</span> <span class="n">DoubleType</span><span class="p">,</span> <span class="n">StringType</span> |
| <span class="kn">import</span><span class="w"> </span><span class="nn">pandas</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">pd</span> |
| |
| <span class="n">data</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="p">(</span><span class="mf">10.0</span><span class="p">,</span> <span class="s2">"Spark"</span><span class="p">),</span> |
| <span class="p">(</span><span class="mf">20.0</span><span class="p">,</span> <span class="s2">"Big Data"</span><span class="p">),</span> |
| <span class="p">(</span><span class="mf">30.0</span><span class="p">,</span> <span class="s2">"AI"</span><span class="p">),</span> |
| <span class="p">(</span><span class="mf">40.0</span><span class="p">,</span> <span class="s2">"Machine Learning"</span><span class="p">),</span> |
| <span class="p">(</span><span class="mf">50.0</span><span class="p">,</span> <span class="s2">"Deep Learning"</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="p">[</span><span class="s2">"numeric_column"</span><span class="p">,</span> <span class="s2">"text_column"</span><span class="p">])</span> |
| |
| <span class="c1"># Schema for the result</span> |
| <span class="n">schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span> |
| <span class="n">StructField</span><span class="p">(</span><span class="s2">"mean_value"</span><span class="p">,</span> <span class="n">DoubleType</span><span class="p">(),</span> <span class="kc">True</span><span class="p">),</span> |
| <span class="n">StructField</span><span class="p">(</span><span class="s2">"sum_value"</span><span class="p">,</span> <span class="n">DoubleType</span><span class="p">(),</span> <span class="kc">True</span><span class="p">),</span> |
| <span class="n">StructField</span><span class="p">(</span><span class="s2">"processed_text"</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="kc">True</span><span class="p">)</span> |
| <span class="p">])</span> |
| |
| <span class="nd">@pandas_udf</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span> |
| <span class="k">def</span><span class="w"> </span><span class="nf">compute_stats_and_transform_string</span><span class="p">(</span><span class="n">numeric_col</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">,</span> <span class="n">text_col</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> |
| <span class="n">mean_value</span> <span class="o">=</span> <span class="n">numeric_col</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span> |
| <span class="n">sum_value</span> <span class="o">=</span> <span class="n">numeric_col</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span> |
| |
| <span class="c1"># Reverse the string if its length is greater than 5, otherwise convert it to uppercase</span> |
| <span class="n">processed_text</span> <span class="o">=</span> <span class="n">text_col</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[::</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="o">></span> <span class="mi">5</span> <span class="k">else</span> <span class="n">x</span><span class="o">.</span><span class="n">upper</span><span class="p">())</span> |
| |
| <span class="n">result_df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span> |
| <span class="s2">"mean_value"</span><span class="p">:</span> <span class="p">[</span><span class="n">mean_value</span><span class="p">]</span> <span class="o">*</span> <span class="nb">len</span><span class="p">(</span><span class="n">text_col</span><span class="p">),</span> |
| <span class="s2">"sum_value"</span><span class="p">:</span> <span class="p">[</span><span class="n">sum_value</span><span class="p">]</span> <span class="o">*</span> <span class="nb">len</span><span class="p">(</span><span class="n">text_col</span><span class="p">),</span> |
| <span class="s2">"processed_text"</span><span class="p">:</span> <span class="n">processed_text</span> |
| <span class="p">})</span> |
| |
| <span class="k">return</span> <span class="n">result_df</span> |
| |
| <span class="n">df</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">"result"</span><span class="p">,</span> <span class="n">compute_stats_and_transform_string</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s2">"numeric_column"</span><span class="p">],</span> <span class="n">df</span><span class="p">[</span><span class="s2">"text_column"</span><span class="p">]))</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="n">truncate</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +--------------+----------------+------------------------------+ |
| |numeric_column|text_column |result | |
| +--------------+----------------+------------------------------+ |
| |10.0 |Spark |{10.0, 10.0, SPARK} | |
| |20.0 |Big Data |{20.0, 20.0, ataD giB} | |
| |30.0 |AI |{30.0, 30.0, AI} | |
| |40.0 |Machine Learning|{40.0, 40.0, gninraeL enihcaM}| |
| |50.0 |Deep Learning |{50.0, 50.0, gninraeL peeD} | |
| +--------------+----------------+------------------------------+ |
| |
| </pre></div></div> |
| </div> |
| </section> |
| </section> |
| </section> |
| <section id="Python-UDTFs"> |
| <h2>Python UDTFs<a class="headerlink" href="#Python-UDTFs" title="Permalink to this headline">#</a></h2> |
| <p>A Python user-defined table function (UDTF) is a new kind of function that returns a table as output instead of a single scalar result value. Once registered, it can appear in the FROM clause of a SQL query.</p> |
| <section id="When-to-use-Python-UDTFs"> |
| <h3>When to use Python UDTFs<a class="headerlink" href="#When-to-use-Python-UDTFs" title="Permalink to this headline">#</a></h3> |
| <p>In short, if you want a function that generates multiple rows and columns, and want to leverage the rich Python ecosystem, Python UDTFs are for you.</p> |
| <ul class="simple"> |
| <li><p><strong>Python UDTFs vs Python UDFs</strong>: While Python UDFs in Spark are designed to each accept zero or more scalar values as input, and return a single value as output, UDTFs offer more flexibility. They can return multiple rows and columns, extending the capabilities of UDFs. Here are a few scenarios where UDTFs are particularly helpful:</p> |
| <ul> |
| <li><p>Exploding nested data types like arrays or structs, transforming them into multiple rows</p></li> |
| <li><p>Dealing with string data that needs to be split into multiple parts, each represented as a separate row or multiple columns</p></li> |
| <li><p>Generating rows based on input ranges, such as creating sequences of numbers, timestamps, or records for different dates</p></li> |
| </ul> |
| </li> |
| <li><p><strong>Python UDTFs vs SQL UDTFs</strong>: SQL UDTFs are efficient and versatile, but Python offers a richer set of libraries and tools. Compared to SQL, Python provides tools to enable advanced transformations or computations (e.g. statistical functions or machine learning inferences).</p></li> |
| </ul> |
| </section> |
| <section id="To-create-a-Python-UDTF"> |
| <h3>To create a Python UDTF<a class="headerlink" href="#To-create-a-Python-UDTF" title="Permalink to this headline">#</a></h3> |
| <p>In the code below, we’ve created a simple UDTF that takes two integers as inputs and produces two columns as output: the original number and its square.</p> |
| <p>Note the use of the <code class="docutils literal notranslate"><span class="pre">yield</span></code> statement: a Python UDTF requires each value it yields to be either a tuple or a Row object so that the results can be processed properly.</p> |
| <p>Also note that the return type must be a <code class="docutils literal notranslate"><span class="pre">StructType</span></code> or a DDL string representing a <code class="docutils literal notranslate"><span class="pre">StructType</span></code> in Spark.</p> |
| <div class="nbinput nblast docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[12]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.functions</span><span class="w"> </span><span class="kn">import</span> <span class="n">udtf</span> |
| |
| <span class="nd">@udtf</span><span class="p">(</span><span class="n">returnType</span><span class="o">=</span><span class="s2">"num: int, squared: int"</span><span class="p">)</span> |
| <span class="k">class</span><span class="w"> </span><span class="nc">SquareNumbers</span><span class="p">:</span> |
| <span class="k">def</span><span class="w"> </span><span class="nf">eval</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">start</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">end</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">for</span> <span class="n">num</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">start</span><span class="p">,</span> <span class="n">end</span> <span class="o">+</span> <span class="mi">1</span><span class="p">):</span> |
| <span class="k">yield</span> <span class="p">(</span><span class="n">num</span><span class="p">,</span> <span class="n">num</span> <span class="o">*</span> <span class="n">num</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <section id="id1"> |
| <h4>Arrow optimization<a class="headerlink" href="#id1" title="Permalink to this headline">#</a></h4> |
| <p>Apache Arrow is an in-memory columnar data format that allows for efficient data transfers between Java and Python processes. It can significantly boost performance when the UDTF outputs many rows. Arrow optimization can be enabled using <code class="docutils literal notranslate"><span class="pre">useArrow=True</span></code>, for example:</p> |
| <div class="highlight-py notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.functions</span><span class="w"> </span><span class="kn">import</span> <span class="n">udtf</span> |
| |
| <span class="nd">@udtf</span><span class="p">(</span><span class="n">returnType</span><span class="o">=</span><span class="s2">"num: int, squared: int"</span><span class="p">,</span> <span class="n">useArrow</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="k">class</span><span class="w"> </span><span class="nc">SquareNumbers</span><span class="p">:</span> |
| <span class="o">...</span> |
| </pre></div> |
| </div> |
| </section> |
| </section> |
| <section id="To-use-a-Python-UDTF"> |
| <h3>To use a Python UDTF<a class="headerlink" href="#To-use-a-Python-UDTF" title="Permalink to this headline">#</a></h3> |
| <p>In Python, we can invoke a UDTF directly using the class name, as shown below.</p> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[13]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.functions</span><span class="w"> </span><span class="kn">import</span> <span class="n">lit</span> |
| |
| <span class="n">SquareNumbers</span><span class="p">(</span><span class="n">lit</span><span class="p">(</span><span class="mi">1</span><span class="p">),</span> <span class="n">lit</span><span class="p">(</span><span class="mi">3</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+-------+ |
| |num|squared| |
| +---+-------+ |
| | 1| 1| |
| | 2| 4| |
| | 3| 9| |
| +---+-------+ |
| |
| </pre></div></div> |
| </div> |
| <p>In SQL, we can register the Python UDTF and then use it in SQL as a table-valued function in the FROM clause of a query.</p> |
| <div class="highlight-none notranslate"><div class="highlight"><pre><span></span>spark.udtf.register("square_numbers", SquareNumbers) |
| spark.sql("SELECT * FROM square_numbers(1, 3)").show() |
| </pre></div> |
| </div> |
| </section> |
| <section id="id2"> |
| <h3>More Examples<a class="headerlink" href="#id2" title="Permalink to this headline">#</a></h3> |
| <section id="Example-1:-Generating-Numbers,-Their-Squares,-Cubes,-and-Factorials-for-a-Range"> |
| <h4>Example 1: Generating Numbers, Their Squares, Cubes, and Factorials for a Range<a class="headerlink" href="#Example-1:-Generating-Numbers,-Their-Squares,-Cubes,-and-Factorials-for-a-Range" title="Permalink to this headline">#</a></h4> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[14]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.functions</span><span class="w"> </span><span class="kn">import</span> <span class="n">lit</span><span class="p">,</span> <span class="n">udtf</span> |
| <span class="kn">import</span><span class="w"> </span><span class="nn">math</span> |
| |
| <span class="nd">@udtf</span><span class="p">(</span><span class="n">returnType</span><span class="o">=</span><span class="s2">"num: int, square: int, cube: int, factorial: int"</span><span class="p">)</span> |
| <span class="k">class</span><span class="w"> </span><span class="nc">GenerateComplexNumbers</span><span class="p">:</span> |
| <span class="k">def</span><span class="w"> </span><span class="nf">eval</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">start</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">end</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">for</span> <span class="n">num</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">start</span><span class="p">,</span> <span class="n">end</span> <span class="o">+</span> <span class="mi">1</span><span class="p">):</span> |
| <span class="k">yield</span> <span class="p">(</span><span class="n">num</span><span class="p">,</span> <span class="n">num</span> <span class="o">**</span> <span class="mi">2</span><span class="p">,</span> <span class="n">num</span> <span class="o">**</span> <span class="mi">3</span><span class="p">,</span> <span class="n">math</span><span class="o">.</span><span class="n">factorial</span><span class="p">(</span><span class="n">num</span><span class="p">))</span> |
| |
| <span class="n">GenerateComplexNumbers</span><span class="p">(</span><span class="n">lit</span><span class="p">(</span><span class="mi">1</span><span class="p">),</span> <span class="n">lit</span><span class="p">(</span><span class="mi">5</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +---+------+----+---------+ |
| |num|square|cube|factorial| |
| +---+------+----+---------+ |
| | 1| 1| 1| 1| |
| | 2| 4| 8| 2| |
| | 3| 9| 27| 6| |
| | 4| 16| 64| 24| |
| | 5| 25| 125| 120| |
| +---+------+----+---------+ |
| |
| </pre></div></div> |
| </div> |
| </section> |
| <section id="Example-2:-Splitting-a-Sentence-into-Words-and-Performing-Multiple-Operations"> |
| <h4>Example 2: Splitting a Sentence into Words and Performing Multiple Operations<a class="headerlink" href="#Example-2:-Splitting-a-Sentence-into-Words-and-Performing-Multiple-Operations" title="Permalink to this headline">#</a></h4> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[15]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.functions</span><span class="w"> </span><span class="kn">import</span> <span class="n">lit</span><span class="p">,</span> <span class="n">udtf</span> |
| |
| <span class="nd">@udtf</span><span class="p">(</span><span class="n">returnType</span><span class="o">=</span><span class="s2">"word: string, length: int, is_palindrome: boolean"</span><span class="p">)</span> |
| <span class="k">class</span><span class="w"> </span><span class="nc">ProcessWords</span><span class="p">:</span> |
| <span class="k">def</span><span class="w"> </span><span class="nf">eval</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">sentence</span><span class="p">:</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">words</span> <span class="o">=</span> <span class="n">sentence</span><span class="o">.</span><span class="n">split</span><span class="p">()</span> <span class="c1"># Split sentence into words</span> |
| <span class="k">for</span> <span class="n">word</span> <span class="ow">in</span> <span class="n">words</span><span class="p">:</span> |
| <span class="n">is_palindrome</span> <span class="o">=</span> <span class="n">word</span> <span class="o">==</span> <span class="n">word</span><span class="p">[::</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> <span class="c1"># Check if the word is a palindrome</span> |
| <span class="k">yield</span> <span class="p">(</span><span class="n">word</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">word</span><span class="p">),</span> <span class="n">is_palindrome</span><span class="p">)</span> |
| |
| <span class="n">ProcessWords</span><span class="p">(</span><span class="n">lit</span><span class="p">(</span><span class="s2">"hello world"</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +-----+------+-------------+ |
| | word|length|is_palindrome| |
| +-----+------+-------------+ |
| |hello| 5| false| |
| |world| 5| false| |
| +-----+------+-------------+ |
| |
| </pre></div></div> |
| </div> |
| </section> |
| <section id="Example-3:-Parsing-JSON-String-into-Key-Value-Pairs-with-Data-Types"> |
| <h4>Example 3: Parsing JSON String into Key-Value Pairs with Data Types<a class="headerlink" href="#Example-3:-Parsing-JSON-String-into-Key-Value-Pairs-with-Data-Types" title="Permalink to this headline">#</a></h4> |
| <div class="nbinput docutils container"> |
| <div class="prompt highlight-none notranslate"><div class="highlight"><pre><span></span>[16]: |
| </pre></div> |
| </div> |
| <div class="input_area highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">json</span> |
| <span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.functions</span><span class="w"> </span><span class="kn">import</span> <span class="n">lit</span><span class="p">,</span> <span class="n">udtf</span> |
| |
| <span class="nd">@udtf</span><span class="p">(</span><span class="n">returnType</span><span class="o">=</span><span class="s2">"key: string, value: string, value_type: string"</span><span class="p">)</span> |
| <span class="k">class</span><span class="w"> </span><span class="nc">ParseJSON</span><span class="p">:</span> |
| <span class="k">def</span><span class="w"> </span><span class="nf">eval</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">json_str</span><span class="p">:</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="k">try</span><span class="p">:</span> |
| <span class="n">json_data</span> <span class="o">=</span> <span class="n">json</span><span class="o">.</span><span class="n">loads</span><span class="p">(</span><span class="n">json_str</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">json_data</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> |
| <span class="n">value_type</span> <span class="o">=</span> <span class="nb">type</span><span class="p">(</span><span class="n">value</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span> |
| <span class="k">yield</span> <span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="nb">str</span><span class="p">(</span><span class="n">value</span><span class="p">),</span> <span class="n">value_type</span><span class="p">)</span> |
| <span class="k">except</span> <span class="n">json</span><span class="o">.</span><span class="n">JSONDecodeError</span><span class="p">:</span> |
| <span class="k">yield</span> <span class="p">(</span><span class="s2">"Invalid JSON"</span><span class="p">,</span> <span class="s2">""</span><span class="p">,</span> <span class="s2">""</span><span class="p">)</span> |
| |
| <span class="n">ParseJSON</span><span class="p">(</span><span class="n">lit</span><span class="p">(</span><span class="s1">'{"name": "Alice", "age": 25, "is_student": false}'</span><span class="p">))</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="nboutput nblast docutils container"> |
| <div class="prompt empty docutils container"> |
| </div> |
| <div class="output_area docutils container"> |
| <div class="highlight"><pre> |
| +----------+-----+----------+ |
| | key|value|value_type| |
| +----------+-----+----------+ |
| | name|Alice| str| |
| | age| 25| int| |
| |is_student|False| bool| |
| +----------+-----+----------+ |
| |
| </pre></div></div> |
| </div> |
| </section> |
| </section> |
| </section> |
| </section> |
| |
| |
| </article> |
| |
| |
| |
| <footer class="bd-footer-article"> |
| |
| <div class="footer-article-items footer-article__inner"> |
| |
| <div class="footer-article-item"><!-- Previous / next buttons --> |
| <div class="prev-next-area"> |
| <a class="left-prev" |
| href="bugbusting.html" |
| title="previous page"> |
| <i class="fa-solid fa-angle-left"></i> |
| <div class="prev-next-info"> |
| <p class="prev-next-subtitle">previous</p> |
| <p class="prev-next-title">Chapter 4: Bug Busting - Debugging PySpark</p> |
| </div> |
| </a> |
| <a class="right-next" |
| href="sql.html" |
| title="next page"> |
| <div class="prev-next-info"> |
| <p class="prev-next-subtitle">next</p> |
| <p class="prev-next-title">Chapter 6: Old SQL, New Tricks - Running SQL on PySpark</p> |
| </div> |
| <i class="fa-solid fa-angle-right"></i> |
| </a> |
| </div></div> |
| |
| </div> |
| |
| </footer> |
| |
| </div> |
| |
| |
| |
| <div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner"> |
| |
| <!-- In-page table of contents. The nav landmark carries an accessible name so it can be told apart from any other navigation region, and the list icon is decorative, so it is hidden from assistive technology. --> |
| <div class="sidebar-secondary-item"> |
| <div class="page-toc tocsection onthispage"> |
| <i class="fa-solid fa-list" aria-hidden="true"></i> On this page |
| </div> |
| <nav class="bd-toc-nav page-toc" aria-label="On this page"> |
| <ul class="visible nav section-nav flex-column"> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#Python-UDFs">Python UDFs</a><ul class="nav section-nav flex-column"> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#Categories-of-Python-UDFs">Categories of Python UDFs</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#To-create-a-Scalar-Python-UDF">To create a Scalar Python UDF</a><ul class="nav section-nav flex-column"> |
| <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#Arrow-optimization">Arrow optimization</a></li> |
| </ul> |
| </li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#To-use-a-Scalar-Python-UDF">To use a Scalar Python UDF</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#To-create-a-Pandas-UDF">To create a Pandas UDF</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#To-use-a-Pandas-UDF">To use a Pandas UDF</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#More-Examples">More Examples</a><ul class="nav section-nav flex-column"> |
| <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#Example-1:-Python-UDF-to-Process-DataFrame-with-String-and-List-Columns">Example 1: Python UDF to Process DataFrame with String and List Columns</a></li> |
| <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#Example-2:-Pandas-UDF-for-Statistical-Computations-and-Complex-Transformation">Example 2: Pandas UDF for Statistical Computations and Complex Transformation</a></li> |
| </ul> |
| </li> |
| </ul> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#Python-UDTFs">Python UDTFs</a><ul class="nav section-nav flex-column"> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#When-to-use-Python-UDTFs">When to use Python UDTFs</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#To-create-a-Python-UDTF">To create a Python UDTF</a><ul class="nav section-nav flex-column"> |
| <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id1">Arrow optimization</a></li> |
| </ul> |
| </li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#To-use-a-Python-UDTF">To use a Python UDTF</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id2">More Examples</a><ul class="nav section-nav flex-column"> |
| <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#Example-1:-Generating-Numbers,-Their-Squares,-Cubes,-and-Factorials-for-a-Range">Example 1: Generating Numbers, Their Squares, Cubes, and Factorials for a Range</a></li> |
| <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#Example-2:-Splitting-a-Sentence-into-Words-and-Performing-Multiple-Operations">Example 2: Splitting a Sentence into Words and Performing Multiple Operations</a></li> |
| <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#Example-3:-Parsing-JSON-String-into-Key-Value-Pairs-with-Data-Types">Example 3: Parsing JSON String into Key-Value Pairs with Data Types</a></li> |
| </ul> |
| </li> |
| </ul> |
| </li> |
| </ul> |
| </nav></div> |
| |
| <!-- Link to the notebook source of this page. The file icon is decorative, so it is hidden from assistive technology; "Show Source" is the link text. --> |
| <div class="sidebar-secondary-item"> |
| <div class="tocsection sourcelink"> |
| <a href="../_sources/user_guide/udfandudtf.ipynb.txt"> |
| <i class="fa-solid fa-file-lines" aria-hidden="true"></i> Show Source |
| </a> |
| </div> |
| </div> |
| |
| </div></div> |
| |
| |
| </div> |
| <!-- Footer element with the bd-footer-content class; it has no child elements on this page. --> |
| <footer class="bd-footer-content"> |
| |
| </footer> |
| |
| </main> |
| </div> |
| </div> |
| |
| <!-- Theme scripts (Bootstrap first, then the PyData Sphinx Theme bundle) are referenced here, below the main content, so they do not block parsing of the markup above; both are preloaded via link rel="preload" in <head>. --> |
| <script src="../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script> |
| <script src="../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script> |
| |
| <!-- Site footer: copyright and license notice plus the Sphinx version at the start, theme attribution at the end. --> |
| <footer class="bd-footer"> |
| <div class="bd-footer__inner bd-page-width"> |
| |
| <div class="footer-items__start"> |
| |
| <div class="footer-item"><p class="copyright"> |
| Copyright © 2025 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. |
| </p></div> |
| |
| <div class="footer-item"> |
| <p class="sphinx-version"> |
| Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0. |
| <br/> |
| </p> |
| </div> |
| |
| </div> |
| |
| |
| <div class="footer-items__end"> |
| |
| <div class="footer-item"><p class="theme-version"> |
| Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3. |
| </p></div> |
| |
| </div> |
| |
| </div> |
| |
| </footer> |
| </body> |
| </html> |