Source code for pyspark.sql.functions

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd">A collections of builtin functions</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="kn">import</span> <span class="nn">inspect</span>
<span class="kn">import</span> <span class="nn">decimal</span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="kn">import</span> <span class="nn">functools</span>
<span class="kn">import</span> <span class="nn">warnings</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">Any</span><span class="p">,</span>
<span class="n">cast</span><span class="p">,</span>
<span class="n">Callable</span><span class="p">,</span>
<span class="n">Dict</span><span class="p">,</span>
<span class="n">List</span><span class="p">,</span>
<span class="n">Iterable</span><span class="p">,</span>
<span class="n">overload</span><span class="p">,</span>
<span class="n">Optional</span><span class="p">,</span>
<span class="n">Tuple</span><span class="p">,</span>
<span class="n">Type</span><span class="p">,</span>
<span class="n">TYPE_CHECKING</span><span class="p">,</span>
<span class="n">Union</span><span class="p">,</span>
<span class="n">ValuesView</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">py4j.java_gateway</span> <span class="kn">import</span> <span class="n">JVMView</span>
<span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">SparkContext</span>
<span class="kn">from</span> <span class="nn">pyspark.errors</span> <span class="kn">import</span> <span class="n">PySparkTypeError</span><span class="p">,</span> <span class="n">PySparkValueError</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.column</span> <span class="kn">import</span> <span class="n">Column</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">,</span> <span class="n">_create_column_from_literal</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.dataframe</span> <span class="kn">import</span> <span class="n">DataFrame</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="n">ArrayType</span><span class="p">,</span> <span class="n">DataType</span><span class="p">,</span> <span class="n">StringType</span><span class="p">,</span> <span class="n">StructType</span><span class="p">,</span> <span class="n">_from_numpy_type</span>
<span class="c1"># Keep UserDefinedFunction import for backwards compatible import; moved in SPARK-22409</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.udf</span> <span class="kn">import</span> <span class="n">UserDefinedFunction</span><span class="p">,</span> <span class="n">_create_py_udf</span> <span class="c1"># noqa: F401</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.udtf</span> <span class="kn">import</span> <span class="n">UserDefinedTableFunction</span><span class="p">,</span> <span class="n">_create_py_udtf</span>
<span class="c1"># Keep pandas_udf and PandasUDFType import for backwards compatible import; moved in SPARK-28264</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.pandas.functions</span> <span class="kn">import</span> <span class="n">pandas_udf</span><span class="p">,</span> <span class="n">PandasUDFType</span> <span class="c1"># noqa: F401</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.utils</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">to_str</span><span class="p">,</span>
<span class="n">has_numpy</span><span class="p">,</span>
<span class="n">try_remote_functions</span><span class="p">,</span>
<span class="n">get_active_spark_context</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyspark.sql._typing</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">ColumnOrName</span><span class="p">,</span>
<span class="n">ColumnOrName_</span><span class="p">,</span>
<span class="n">DataTypeOrString</span><span class="p">,</span>
<span class="n">UserDefinedFunctionLike</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">has_numpy</span><span class="p">:</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="c1"># Note to developers: all of PySpark functions here take string as column names whenever possible.</span>
<span class="c1"># Namely, if columns are referred as arguments, they can always be both Column or string,</span>
<span class="c1"># even though there might be few exceptions for legacy or inevitable reasons.</span>
<span class="c1"># If you are fixing other language APIs together, also please note that Scala side is not the case</span>
<span class="c1"># since it requires making every single overridden definition.</span>
<span class="k">def</span> <span class="nf">_get_jvm_function</span><span class="p">(</span><span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">sc</span><span class="p">:</span> <span class="n">SparkContext</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Callable</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Retrieves JVM function identified by name from</span>
<span class="sd"> Java gateway associated with sc.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">assert</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="k">return</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="p">,</span> <span class="n">name</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_invoke_function</span><span class="p">(</span><span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Invokes JVM function identified by name with args</span>
<span class="sd"> and wraps the result with :class:`~pyspark.sql.Column`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">assert</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="n">jf</span> <span class="o">=</span> <span class="n">_get_jvm_function</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jf</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">))</span>
<span class="k">def</span> <span class="nf">_invoke_function_over_columns</span><span class="p">(</span><span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Invokes n-ary JVM function identified by name</span>
<span class="sd"> and wraps the result with :class:`~pyspark.sql.Column`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="o">*</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">cols</span><span class="p">))</span>
<span class="k">def</span> <span class="nf">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;Iterable[ColumnOrName]&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Invokes unary JVM function identified by name with</span>
<span class="sd"> and wraps the result with :class:`~pyspark.sql.Column`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span>
<span class="k">def</span> <span class="nf">_invoke_binary_math_function</span><span class="p">(</span><span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">col1</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Invokes binary JVM math function identified by name</span>
<span class="sd"> and wraps the result with :class:`~pyspark.sql.Column`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># For legacy reasons, the arguments here can be implicitly converted into column</span>
<span class="n">cols</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">_to_java_column</span><span class="p">(</span><span class="n">c</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">c</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">))</span> <span class="k">else</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">c</span><span class="p">)</span>
<span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span>
<span class="p">]</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]]:</span>
<span class="k">if</span> <span class="n">options</span><span class="p">:</span>
<span class="k">return</span> <span class="p">{</span><span class="n">key</span><span class="p">:</span> <span class="n">to_str</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> <span class="k">for</span> <span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span> <span class="ow">in</span> <span class="n">options</span><span class="o">.</span><span class="n">items</span><span class="p">()}</span>
<span class="k">return</span> <span class="p">{}</span>
<div class="viewcode-block" id="lit"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.lit.html#pyspark.sql.functions.lit">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">lit</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Creates a :class:`~pyspark.sql.Column` of literal value.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column`, str, int, float, bool or list, NumPy literals or ndarray.</span>
<span class="sd"> the value to make it as a PySpark literal. If a column is passed,</span>
<span class="sd"> it returns the column as is.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Since 3.4.0, it supports the list type.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the literal instance.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(lit(5).alias(&#39;height&#39;), df.id).show()</span>
<span class="sd"> +------+---+</span>
<span class="sd"> |height| id|</span>
<span class="sd"> +------+---+</span>
<span class="sd"> | 5| 0|</span>
<span class="sd"> +------+---+</span>
<span class="sd"> Create a literal from a list.</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).select(lit([1, 2, 3])).show()</span>
<span class="sd"> +--------------+</span>
<span class="sd"> |array(1, 2, 3)|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> | [1, 2, 3]|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="k">return</span> <span class="n">col</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">c</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">col</span><span class="p">):</span>
<span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span>
<span class="n">error_class</span><span class="o">=</span><span class="s2">&quot;COLUMN_IN_LIST&quot;</span><span class="p">,</span> <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;func_name&quot;</span><span class="p">:</span> <span class="s2">&quot;lit&quot;</span><span class="p">}</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">array</span><span class="p">(</span><span class="o">*</span><span class="p">[</span><span class="n">lit</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">col</span><span class="p">])</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">if</span> <span class="n">has_numpy</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">generic</span><span class="p">):</span>
<span class="n">dt</span> <span class="o">=</span> <span class="n">_from_numpy_type</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span>
<span class="k">if</span> <span class="n">dt</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;lit&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">dt</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">col</span><span class="p">))</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;lit&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="col"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.col.html#pyspark.sql.functions.col">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">col</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a :class:`~pyspark.sql.Column` based on the given column name.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : str</span>
<span class="sd"> the name for the column</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the corresponding column instance.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; col(&#39;x&#39;)</span>
<span class="sd"> Column&lt;&#39;x&#39;&gt;</span>
<span class="sd"> &gt;&gt;&gt; column(&#39;x&#39;)</span>
<span class="sd"> Column&lt;&#39;x&#39;&gt;</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;col&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<span class="n">column</span> <span class="o">=</span> <span class="n">col</span>
<div class="viewcode-block" id="asc"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.asc.html#pyspark.sql.functions.asc">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">asc</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a sort expression based on the ascending order of the given column name.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to sort by in the ascending order.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column specifying the order.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Sort by the column &#39;id&#39; in the descending order.</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(5)</span>
<span class="sd"> &gt;&gt;&gt; df = df.sort(desc(&quot;id&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.show()</span>
<span class="sd"> +---+</span>
<span class="sd"> | id|</span>
<span class="sd"> +---+</span>
<span class="sd"> | 4|</span>
<span class="sd"> | 3|</span>
<span class="sd"> | 2|</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 0|</span>
<span class="sd"> +---+</span>
<span class="sd"> Sort by the column &#39;id&#39; in the ascending order.</span>
<span class="sd"> &gt;&gt;&gt; df.orderBy(asc(&quot;id&quot;)).show()</span>
<span class="sd"> +---+</span>
<span class="sd"> | id|</span>
<span class="sd"> +---+</span>
<span class="sd"> | 0|</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 2|</span>
<span class="sd"> | 3|</span>
<span class="sd"> | 4|</span>
<span class="sd"> +---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">col</span><span class="o">.</span><span class="n">asc</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> <span class="k">else</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;asc&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="desc"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.desc.html#pyspark.sql.functions.desc">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">desc</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a sort expression based on the descending order of the given column name.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to sort by in the descending order.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column specifying the order.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Sort by the column &#39;id&#39; in the descending order.</span>
<span class="sd"> &gt;&gt;&gt; spark.range(5).orderBy(desc(&quot;id&quot;)).show()</span>
<span class="sd"> +---+</span>
<span class="sd"> | id|</span>
<span class="sd"> +---+</span>
<span class="sd"> | 4|</span>
<span class="sd"> | 3|</span>
<span class="sd"> | 2|</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 0|</span>
<span class="sd"> +---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">col</span><span class="o">.</span><span class="n">desc</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> <span class="k">else</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;desc&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="sqrt"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sqrt.html#pyspark.sql.functions.sqrt">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">sqrt</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the square root of the specified float value.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(sqrt(lit(4))).show()</span>
<span class="sd"> +-------+</span>
<span class="sd"> |SQRT(4)|</span>
<span class="sd"> +-------+</span>
<span class="sd"> | 2.0|</span>
<span class="sd"> +-------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;sqrt&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="try_add"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_add.html#pyspark.sql.functions.try_add">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">try_add</span><span class="p">(</span><span class="n">left</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">right</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the sum of `left`and `right` and the result is null on overflow.</span>
<span class="sd"> The acceptable input types are the same with the `+` operator.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> left : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> right : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1982, 15), (1990, 2)], [&quot;birth&quot;, &quot;age&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_add(df.birth, df.age).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=1997), Row(r=1992)]</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.types import StructType, StructField, IntegerType, StringType</span>
<span class="sd"> &gt;&gt;&gt; schema = StructType([</span>
<span class="sd"> ... StructField(&quot;i&quot;, IntegerType(), True),</span>
<span class="sd"> ... StructField(&quot;d&quot;, StringType(), True),</span>
<span class="sd"> ... ])</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, &#39;2015-09-30&#39;)], schema)</span>
<span class="sd"> &gt;&gt;&gt; df = df.select(df.i, to_date(df.d).alias(&#39;d&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_add(df.d, df.i).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=datetime.date(2015, 10, 1))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_add(df.d, make_interval(df.i)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=datetime.date(2016, 9, 30))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... try_add(df.d, make_interval(lit(0), lit(0), lit(0), df.i)).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).collect()</span>
<span class="sd"> [Row(r=datetime.date(2015, 10, 1))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... try_add(make_interval(df.i), make_interval(df.i)).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +-------+</span>
<span class="sd"> |r |</span>
<span class="sd"> +-------+</span>
<span class="sd"> |2 years|</span>
<span class="sd"> +-------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;try_add&quot;</span><span class="p">,</span> <span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">)</span></div>
<div class="viewcode-block" id="try_avg"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_avg.html#pyspark.sql.functions.try_avg">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">try_avg</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the mean calculated from values of a group and the result is null on overflow.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(1982, 15), (1990, 2)], [&quot;birth&quot;, &quot;age&quot;]</span>
<span class="sd"> ... ).select(sf.try_avg(&quot;age&quot;)).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |try_avg(age)|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | 8.5|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;try_avg&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="try_divide"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_divide.html#pyspark.sql.functions.try_divide">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">try_divide</span><span class="p">(</span><span class="n">left</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">right</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns `dividend`/`divisor`. It always performs floating point division. Its result is</span>
<span class="sd"> always null if `divisor` is 0.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> left : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> dividend</span>
<span class="sd"> right : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> divisor</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(6000, 15), (1990, 2)], [&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_divide(df.a, df.b).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=400.0), Row(r=995.0)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, 2)], [&quot;year&quot;, &quot;month&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... try_divide(make_interval(df.year), df.month).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +--------+</span>
<span class="sd"> |r |</span>
<span class="sd"> +--------+</span>
<span class="sd"> |6 months|</span>
<span class="sd"> +--------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... try_divide(make_interval(df.year, df.month), lit(2)).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +--------+</span>
<span class="sd"> |r |</span>
<span class="sd"> +--------+</span>
<span class="sd"> |7 months|</span>
<span class="sd"> +--------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... try_divide(make_interval(df.year, df.month), lit(0)).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +----+</span>
<span class="sd"> |r |</span>
<span class="sd"> +----+</span>
<span class="sd"> |NULL|</span>
<span class="sd"> +----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;try_divide&quot;</span><span class="p">,</span> <span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">)</span></div>
<div class="viewcode-block" id="try_multiply"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_multiply.html#pyspark.sql.functions.try_multiply">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">try_multiply</span><span class="p">(</span><span class="n">left</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">right</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns `left`*`right` and the result is null on overflow. The acceptable input types are the</span>
<span class="sd"> same with the `*` operator.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> left : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> multiplicand</span>
<span class="sd"> right : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> multiplier</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(6000, 15), (1990, 2)], [&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_multiply(df.a, df.b).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=90000), Row(r=3980)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(2, 3),], [&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_multiply(make_interval(df.a), df.b).alias(&#39;r&#39;)).show(truncate=False)</span>
<span class="sd"> +-------+</span>
<span class="sd"> |r |</span>
<span class="sd"> +-------+</span>
<span class="sd"> |6 years|</span>
<span class="sd"> +-------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;try_multiply&quot;</span><span class="p">,</span> <span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">)</span></div>
<div class="viewcode-block" id="try_subtract"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_subtract.html#pyspark.sql.functions.try_subtract">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">try_subtract</span><span class="p">(</span><span class="n">left</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">right</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns `left`-`right` and the result is null on overflow. The acceptable input types are the</span>
<span class="sd"> same with the `-` operator.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> left : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> right : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(6000, 15), (1990, 2)], [&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_subtract(df.a, df.b).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=5985), Row(r=1988)]</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.types import StructType, StructField, IntegerType, StringType</span>
<span class="sd"> &gt;&gt;&gt; schema = StructType([</span>
<span class="sd"> ... StructField(&quot;i&quot;, IntegerType(), True),</span>
<span class="sd"> ... StructField(&quot;d&quot;, StringType(), True),</span>
<span class="sd"> ... ])</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, &#39;2015-09-30&#39;)], schema)</span>
<span class="sd"> &gt;&gt;&gt; df = df.select(df.i, to_date(df.d).alias(&#39;d&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_subtract(df.d, df.i).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=datetime.date(2015, 9, 29))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_subtract(df.d, make_interval(df.i)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=datetime.date(2014, 9, 30))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... try_subtract(df.d, make_interval(lit(0), lit(0), lit(0), df.i)).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).collect()</span>
<span class="sd"> [Row(r=datetime.date(2015, 9, 29))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... try_subtract(make_interval(df.i), make_interval(df.i)).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +---------+</span>
<span class="sd"> |r |</span>
<span class="sd"> +---------+</span>
<span class="sd"> |0 seconds|</span>
<span class="sd"> +---------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;try_subtract&quot;</span><span class="p">,</span> <span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">)</span></div>
<div class="viewcode-block" id="try_sum"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_sum.html#pyspark.sql.functions.try_sum">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">try_sum</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the sum calculated from values of a group and the result is null on overflow.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(10).select(sf.try_sum(&quot;id&quot;)).show()</span>
<span class="sd"> +-----------+</span>
<span class="sd"> |try_sum(id)|</span>
<span class="sd"> +-----------+</span>
<span class="sd"> | 45|</span>
<span class="sd"> +-----------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;try_sum&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="abs"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.abs.html#pyspark.sql.functions.abs">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">abs</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the absolute value.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(abs(lit(-1))).show()</span>
<span class="sd"> +-------+</span>
<span class="sd"> |abs(-1)|</span>
<span class="sd"> +-------+</span>
<span class="sd"> | 1|</span>
<span class="sd"> +-------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;abs&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="mode"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.mode.html#pyspark.sql.functions.mode">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">mode</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the most frequent value in a group.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the most frequent value in a group.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (&quot;Java&quot;, 2012, 20000), (&quot;dotNET&quot;, 2012, 5000),</span>
<span class="sd"> ... (&quot;Java&quot;, 2012, 20000), (&quot;dotNET&quot;, 2012, 5000),</span>
<span class="sd"> ... (&quot;dotNET&quot;, 2013, 48000), (&quot;Java&quot;, 2013, 30000)],</span>
<span class="sd"> ... schema=(&quot;course&quot;, &quot;year&quot;, &quot;earnings&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;course&quot;).agg(mode(&quot;year&quot;)).show()</span>
<span class="sd"> +------+----------+</span>
<span class="sd"> |course|mode(year)|</span>
<span class="sd"> +------+----------+</span>
<span class="sd"> | Java| 2012|</span>
<span class="sd"> |dotNET| 2012|</span>
<span class="sd"> +------+----------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;mode&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="max"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.max.html#pyspark.sql.functions.max">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">max</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the maximum value of the expression in a group.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(10)</span>
<span class="sd"> &gt;&gt;&gt; df.select(max(col(&quot;id&quot;))).show()</span>
<span class="sd"> +-------+</span>
<span class="sd"> |max(id)|</span>
<span class="sd"> +-------+</span>
<span class="sd"> | 9|</span>
<span class="sd"> +-------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;max&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="min"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.min.html#pyspark.sql.functions.min">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">min</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the minimum value of the expression in a group.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(10)</span>
<span class="sd"> &gt;&gt;&gt; df.select(min(df.id)).show()</span>
<span class="sd"> +-------+</span>
<span class="sd"> |min(id)|</span>
<span class="sd"> +-------+</span>
<span class="sd"> | 0|</span>
<span class="sd"> +-------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;min&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="max_by"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.max_by.html#pyspark.sql.functions.max_by">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">max_by</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">ord</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the value associated with the maximum value of ord.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> ord : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column to be maximized</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> value associated with the maximum value of ord.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (&quot;Java&quot;, 2012, 20000), (&quot;dotNET&quot;, 2012, 5000),</span>
<span class="sd"> ... (&quot;dotNET&quot;, 2013, 48000), (&quot;Java&quot;, 2013, 30000)],</span>
<span class="sd"> ... schema=(&quot;course&quot;, &quot;year&quot;, &quot;earnings&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;course&quot;).agg(max_by(&quot;year&quot;, &quot;earnings&quot;)).show()</span>
<span class="sd"> +------+----------------------+</span>
<span class="sd"> |course|max_by(year, earnings)|</span>
<span class="sd"> +------+----------------------+</span>
<span class="sd"> | Java| 2013|</span>
<span class="sd"> |dotNET| 2013|</span>
<span class="sd"> +------+----------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;max_by&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="nb">ord</span><span class="p">)</span></div>
<div class="viewcode-block" id="min_by"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.min_by.html#pyspark.sql.functions.min_by">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">min_by</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">ord</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the value associated with the minimum value of ord.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> ord : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column to be minimized</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> value associated with the minimum value of ord.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (&quot;Java&quot;, 2012, 20000), (&quot;dotNET&quot;, 2012, 5000),</span>
<span class="sd"> ... (&quot;dotNET&quot;, 2013, 48000), (&quot;Java&quot;, 2013, 30000)],</span>
<span class="sd"> ... schema=(&quot;course&quot;, &quot;year&quot;, &quot;earnings&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;course&quot;).agg(min_by(&quot;year&quot;, &quot;earnings&quot;)).show()</span>
<span class="sd"> +------+----------------------+</span>
<span class="sd"> |course|min_by(year, earnings)|</span>
<span class="sd"> +------+----------------------+</span>
<span class="sd"> | Java| 2012|</span>
<span class="sd"> |dotNET| 2012|</span>
<span class="sd"> +------+----------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;min_by&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="nb">ord</span><span class="p">)</span></div>
<div class="viewcode-block" id="count"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.count.html#pyspark.sql.functions.count">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">count</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the number of items in a group.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Count by all columns (start), and by a column that does not count ``None``.</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(None,), (&quot;a&quot;,), (&quot;b&quot;,), (&quot;c&quot;,)], schema=[&quot;alphabets&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(count(expr(&quot;*&quot;)), count(df.alphabets)).show()</span>
<span class="sd"> +--------+----------------+</span>
<span class="sd"> |count(1)|count(alphabets)|</span>
<span class="sd"> +--------+----------------+</span>
<span class="sd"> | 4| 3|</span>
<span class="sd"> +--------+----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;count&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="sum"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sum.html#pyspark.sql.functions.sum">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">sum</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the sum of all values in the expression.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(10)</span>
<span class="sd"> &gt;&gt;&gt; df.select(sum(df[&quot;id&quot;])).show()</span>
<span class="sd"> +-------+</span>
<span class="sd"> |sum(id)|</span>
<span class="sd"> +-------+</span>
<span class="sd"> | 45|</span>
<span class="sd"> +-------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;sum&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="avg"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.avg.html#pyspark.sql.functions.avg">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">avg</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the average of the values in a group.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(10)</span>
<span class="sd"> &gt;&gt;&gt; df.select(avg(col(&quot;id&quot;))).show()</span>
<span class="sd"> +-------+</span>
<span class="sd"> |avg(id)|</span>
<span class="sd"> +-------+</span>
<span class="sd"> | 4.5|</span>
<span class="sd"> +-------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;avg&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="mean"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.mean.html#pyspark.sql.functions.mean">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">mean</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the average of the values in a group.</span>
<span class="sd"> An alias of :func:`avg`.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(10)</span>
<span class="sd"> &gt;&gt;&gt; df.select(mean(df.id)).show()</span>
<span class="sd"> +-------+</span>
<span class="sd"> |avg(id)|</span>
<span class="sd"> +-------+</span>
<span class="sd"> | 4.5|</span>
<span class="sd"> +-------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;mean&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="median"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.median.html#pyspark.sql.functions.median">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">median</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the median of the values in a group.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the median of the values in a group.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (&quot;Java&quot;, 2012, 20000), (&quot;dotNET&quot;, 2012, 5000),</span>
<span class="sd"> ... (&quot;Java&quot;, 2012, 22000), (&quot;dotNET&quot;, 2012, 10000),</span>
<span class="sd"> ... (&quot;dotNET&quot;, 2013, 48000), (&quot;Java&quot;, 2013, 30000)],</span>
<span class="sd"> ... schema=(&quot;course&quot;, &quot;year&quot;, &quot;earnings&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;course&quot;).agg(median(&quot;earnings&quot;)).show()</span>
<span class="sd"> +------+----------------+</span>
<span class="sd"> |course|median(earnings)|</span>
<span class="sd"> +------+----------------+</span>
<span class="sd"> | Java| 22000.0|</span>
<span class="sd"> |dotNET| 10000.0|</span>
<span class="sd"> +------+----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;median&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="sumDistinct"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sumDistinct.html#pyspark.sql.functions.sumDistinct">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">sumDistinct</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the sum of distinct values in the expression.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> .. deprecated:: 3.2.0</span>
<span class="sd"> Use :func:`sum_distinct` instead.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">&quot;Deprecated in 3.2, use sum_distinct instead.&quot;</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span>
<span class="k">return</span> <span class="n">sum_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="sum_distinct"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sum_distinct.html#pyspark.sql.functions.sum_distinct">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">sum_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the sum of distinct values in the expression.</span>
<span class="sd"> .. versionadded:: 3.2.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(None,), (1,), (1,), (2,)], schema=[&quot;numbers&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(sum_distinct(col(&quot;numbers&quot;))).show()</span>
<span class="sd"> +---------------------+</span>
<span class="sd"> |sum(DISTINCT numbers)|</span>
<span class="sd"> +---------------------+</span>
<span class="sd"> | 3|</span>
<span class="sd"> +---------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;sum_distinct&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="product"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.product.html#pyspark.sql.functions.product">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">product</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the product of the values in a group.</span>
<span class="sd"> .. versionadded:: 3.2.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : str, :class:`Column`</span>
<span class="sd"> column containing values to be multiplied together</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1, 10).toDF(&#39;x&#39;).withColumn(&#39;mod3&#39;, col(&#39;x&#39;) % 3)</span>
<span class="sd"> &gt;&gt;&gt; prods = df.groupBy(&#39;mod3&#39;).agg(product(&#39;x&#39;).alias(&#39;product&#39;))</span>
<span class="sd"> &gt;&gt;&gt; prods.orderBy(&#39;mod3&#39;).show()</span>
<span class="sd"> +----+-------+</span>
<span class="sd"> |mod3|product|</span>
<span class="sd"> +----+-------+</span>
<span class="sd"> | 0| 162.0|</span>
<span class="sd"> | 1| 28.0|</span>
<span class="sd"> | 2| 80.0|</span>
<span class="sd"> +----+-------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;product&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="acos"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.acos.html#pyspark.sql.functions.acos">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">acos</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes inverse cosine of the input column.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> inverse cosine of `col`, as if computed by `java.lang.Math.acos()`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1, 3)</span>
<span class="sd"> &gt;&gt;&gt; df.select(acos(df.id)).show()</span>
<span class="sd"> +--------+</span>
<span class="sd"> |ACOS(id)|</span>
<span class="sd"> +--------+</span>
<span class="sd"> | 0.0|</span>
<span class="sd"> | NaN|</span>
<span class="sd"> +--------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;acos&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="acosh"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.acosh.html#pyspark.sql.functions.acosh">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">acosh</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes inverse hyperbolic cosine of the input column.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(2)</span>
<span class="sd"> &gt;&gt;&gt; df.select(acosh(col(&quot;id&quot;))).show()</span>
<span class="sd"> +---------+</span>
<span class="sd"> |ACOSH(id)|</span>
<span class="sd"> +---------+</span>
<span class="sd"> | NaN|</span>
<span class="sd"> | 0.0|</span>
<span class="sd"> +---------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;acosh&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="asin"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.asin.html#pyspark.sql.functions.asin">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">asin</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes inverse sine of the input column.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> inverse sine of `col`, as if computed by `java.lang.Math.asin()`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(0,), (2,)])</span>
<span class="sd"> &gt;&gt;&gt; df.select(asin(df.schema.fieldNames()[0])).show()</span>
<span class="sd"> +--------+</span>
<span class="sd"> |ASIN(_1)|</span>
<span class="sd"> +--------+</span>
<span class="sd"> | 0.0|</span>
<span class="sd"> | NaN|</span>
<span class="sd"> +--------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;asin&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="asinh"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.asinh.html#pyspark.sql.functions.asinh">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">asinh</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes inverse hyperbolic sine of the input column.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(asinh(col(&quot;id&quot;))).show()</span>
<span class="sd"> +---------+</span>
<span class="sd"> |ASINH(id)|</span>
<span class="sd"> +---------+</span>
<span class="sd"> | 0.0|</span>
<span class="sd"> +---------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;asinh&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="atan"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.atan.html#pyspark.sql.functions.atan">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">atan</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute inverse tangent of the input column.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> inverse tangent of `col`, as if computed by `java.lang.Math.atan()`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(atan(df.id)).show()</span>
<span class="sd"> +--------+</span>
<span class="sd"> |ATAN(id)|</span>
<span class="sd"> +--------+</span>
<span class="sd"> | 0.0|</span>
<span class="sd"> +--------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;atan&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="atanh"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.atanh.html#pyspark.sql.functions.atanh">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">atanh</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes inverse hyperbolic tangent of the input column.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(0,), (2,)], schema=[&quot;numbers&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(atanh(df[&quot;numbers&quot;])).show()</span>
<span class="sd"> +--------------+</span>
<span class="sd"> |ATANH(numbers)|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> | 0.0|</span>
<span class="sd"> | NaN|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;atanh&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="cbrt"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.cbrt.html#pyspark.sql.functions.cbrt">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">cbrt</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the cube-root of the given value.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(cbrt(lit(27))).show()</span>
<span class="sd"> +--------+</span>
<span class="sd"> |CBRT(27)|</span>
<span class="sd"> +--------+</span>
<span class="sd"> | 3.0|</span>
<span class="sd"> +--------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;cbrt&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="ceil"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.ceil.html#pyspark.sql.functions.ceil">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">ceil</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the ceiling of the given value.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(ceil(lit(-0.1))).show()</span>
<span class="sd"> +----------+</span>
<span class="sd"> |CEIL(-0.1)|</span>
<span class="sd"> +----------+</span>
<span class="sd"> | 0|</span>
<span class="sd"> +----------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;ceil&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="ceiling"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.ceiling.html#pyspark.sql.functions.ceiling">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">ceiling</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the ceiling of the given value.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).select(sf.ceil(sf.lit(-0.1))).show()</span>
<span class="sd"> +----------+</span>
<span class="sd"> |CEIL(-0.1)|</span>
<span class="sd"> +----------+</span>
<span class="sd"> | 0|</span>
<span class="sd"> +----------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;ceiling&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="cos"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.cos.html#pyspark.sql.functions.cos">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">cos</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes cosine of the input column.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> angle in radians</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> cosine of the angle, as if computed by `java.lang.Math.cos()`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import math</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(cos(lit(math.pi))).first()</span>
<span class="sd"> Row(COS(3.14159...)=-1.0)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;cos&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="cosh"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.cosh.html#pyspark.sql.functions.cosh">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">cosh</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes hyperbolic cosine of the input column.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> hyperbolic angle</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> hyperbolic cosine of the angle, as if computed by `java.lang.Math.cosh()`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(cosh(lit(1))).first()</span>
<span class="sd"> Row(COSH(1)=1.54308...)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;cosh&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="cot"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.cot.html#pyspark.sql.functions.cot">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">cot</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes cotangent of the input column.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> angle in radians.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> cotangent of the angle.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import math</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(cot(lit(math.radians(45)))).first()</span>
<span class="sd"> Row(COT(0.78539...)=1.00000...)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;cot&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="csc"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.csc.html#pyspark.sql.functions.csc">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">csc</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes cosecant of the input column.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> angle in radians.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> cosecant of the angle.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import math</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(csc(lit(math.radians(90)))).first()</span>
<span class="sd"> Row(CSC(1.57079...)=1.0)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;csc&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="e"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.e.html#pyspark.sql.functions.e">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">e</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns Euler&#39;s number.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).select(e()).show()</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> | E()|</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> |2.718281828459045|</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;e&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="exp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.exp.html#pyspark.sql.functions.exp">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">exp</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the exponential of the given value.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column to calculate exponential for.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> exponential of the given value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(exp(lit(0))).show()</span>
<span class="sd"> +------+</span>
<span class="sd"> |EXP(0)|</span>
<span class="sd"> +------+</span>
<span class="sd"> | 1.0|</span>
<span class="sd"> +------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;exp&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="expm1"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.expm1.html#pyspark.sql.functions.expm1">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">expm1</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the exponential of the given value minus one.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column to calculate exponential for.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> exponential less one.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(expm1(lit(1))).first()</span>
<span class="sd"> Row(EXPM1(1)=1.71828...)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;expm1&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="floor"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.floor.html#pyspark.sql.functions.floor">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">floor</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the floor of the given value.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column to find floor for.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> nearest integer that is less than or equal to given value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(floor(lit(2.5))).show()</span>
<span class="sd"> +----------+</span>
<span class="sd"> |FLOOR(2.5)|</span>
<span class="sd"> +----------+</span>
<span class="sd"> | 2|</span>
<span class="sd"> +----------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;floor&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">log</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the natural logarithm of the given value.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column to calculate natural logarithm for.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> natural logarithm of the given value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import math</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(log(lit(math.e))).first()</span>
<span class="sd"> Row(ln(2.71828...)=1.0)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;log&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<div class="viewcode-block" id="log10"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.log10.html#pyspark.sql.functions.log10">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">log10</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the logarithm of the given value in Base 10.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column to calculate logarithm for.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> logarithm of the given value in Base 10.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(log10(lit(100))).show()</span>
<span class="sd"> +----------+</span>
<span class="sd"> |LOG10(100)|</span>
<span class="sd"> +----------+</span>
<span class="sd"> | 2.0|</span>
<span class="sd"> +----------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;log10&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="log1p"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.log1p.html#pyspark.sql.functions.log1p">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">log1p</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the natural logarithm of the &quot;given value plus one&quot;.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column to calculate natural logarithm for.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> natural logarithm of the &quot;given value plus one&quot;.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import math</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(log1p(lit(math.e))).first()</span>
<span class="sd"> Row(LOG1P(2.71828...)=1.31326...)</span>
<span class="sd"> Same as:</span>
<span class="sd"> &gt;&gt;&gt; df.select(log(lit(math.e+1))).first()</span>
<span class="sd"> Row(ln(3.71828...)=1.31326...)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;log1p&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="negative"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.negative.html#pyspark.sql.functions.negative">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">negative</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the negative value.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column to calculate negative value for.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> negative value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(3).select(sf.negative(&quot;id&quot;)).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |negative(id)|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | 0|</span>
<span class="sd"> | -1|</span>
<span class="sd"> | -2|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;negative&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<span class="n">negate</span> <span class="o">=</span> <span class="n">negative</span>
<div class="viewcode-block" id="pi"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.pi.html#pyspark.sql.functions.pi">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">pi</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns Pi.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).select(pi()).show()</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> | PI()|</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> |3.141592653589793|</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;pi&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="positive"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.positive.html#pyspark.sql.functions.positive">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">positive</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the value.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> input value column.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(-1,), (0,), (1,)], [&#39;v&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(positive(&quot;v&quot;).alias(&quot;p&quot;)).show()</span>
<span class="sd"> +---+</span>
<span class="sd"> | p|</span>
<span class="sd"> +---+</span>
<span class="sd"> | -1|</span>
<span class="sd"> | 0|</span>
<span class="sd"> | 1|</span>
<span class="sd"> +---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;positive&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="rint"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.rint.html#pyspark.sql.functions.rint">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">rint</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the double value that is closest in value to the argument and</span>
<span class="sd"> is equal to a mathematical integer.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(rint(lit(10.6))).show()</span>
<span class="sd"> +----------+</span>
<span class="sd"> |rint(10.6)|</span>
<span class="sd"> +----------+</span>
<span class="sd"> | 11.0|</span>
<span class="sd"> +----------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(rint(lit(10.3))).show()</span>
<span class="sd"> +----------+</span>
<span class="sd"> |rint(10.3)|</span>
<span class="sd"> +----------+</span>
<span class="sd"> | 10.0|</span>
<span class="sd"> +----------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;rint&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="sec"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sec.html#pyspark.sql.functions.sec">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">sec</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes secant of the input column.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Angle in radians</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> Secant of the angle.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(sec(lit(1.5))).first()</span>
<span class="sd"> Row(SEC(1.5)=14.13683...)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;sec&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="signum"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.signum.html#pyspark.sql.functions.signum">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">signum</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the signum of the given value.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).select(</span>
<span class="sd"> ... sf.signum(sf.lit(-5)),</span>
<span class="sd"> ... sf.signum(sf.lit(6))</span>
<span class="sd"> ... ).show()</span>
<span class="sd"> +----------+---------+</span>
<span class="sd"> |SIGNUM(-5)|SIGNUM(6)|</span>
<span class="sd"> +----------+---------+</span>
<span class="sd"> | -1.0| 1.0|</span>
<span class="sd"> +----------+---------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;signum&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="sign"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sign.html#pyspark.sql.functions.sign">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">sign</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the signum of the given value.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).select(</span>
<span class="sd"> ... sf.sign(sf.lit(-5)),</span>
<span class="sd"> ... sf.sign(sf.lit(6))</span>
<span class="sd"> ... ).show()</span>
<span class="sd"> +--------+-------+</span>
<span class="sd"> |sign(-5)|sign(6)|</span>
<span class="sd"> +--------+-------+</span>
<span class="sd"> | -1.0| 1.0|</span>
<span class="sd"> +--------+-------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;sign&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="sin"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sin.html#pyspark.sql.functions.sin">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">sin</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes sine of the input column.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> sine of the angle, as if computed by `java.lang.Math.sin()`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import math</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(sin(lit(math.radians(90)))).first()</span>
<span class="sd"> Row(SIN(1.57079...)=1.0)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;sin&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="sinh"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sinh.html#pyspark.sql.functions.sinh">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">sinh</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes hyperbolic sine of the input column.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> hyperbolic angle.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> hyperbolic sine of the given value,</span>
<span class="sd"> as if computed by `java.lang.Math.sinh()`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(sinh(lit(1.1))).first()</span>
<span class="sd"> Row(SINH(1.1)=1.33564...)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;sinh&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="tan"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.tan.html#pyspark.sql.functions.tan">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">tan</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes tangent of the input column.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> angle in radians</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> tangent of the given value, as if computed by `java.lang.Math.tan()`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import math</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(tan(lit(math.radians(45)))).first()</span>
<span class="sd"> Row(TAN(0.78539...)=0.99999...)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;tan&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="tanh"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.tanh.html#pyspark.sql.functions.tanh">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">tanh</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes hyperbolic tangent of the input column.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> hyperbolic angle</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> hyperbolic tangent of the given value</span>
<span class="sd"> as if computed by `java.lang.Math.tanh()`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import math</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(tanh(lit(math.radians(90)))).first()</span>
<span class="sd"> Row(TANH(1.57079...)=0.91715...)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;tanh&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="toDegrees"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.toDegrees.html#pyspark.sql.functions.toDegrees">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">toDegrees</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> .. deprecated:: 2.1.0</span>
<span class="sd"> Use :func:`degrees` instead.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">&quot;Deprecated in 2.1, use degrees instead.&quot;</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span>
<span class="k">return</span> <span class="n">degrees</span><span class="p">(</span><span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="toRadians"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.toRadians.html#pyspark.sql.functions.toRadians">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">toRadians</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> .. deprecated:: 2.1.0</span>
<span class="sd"> Use :func:`radians` instead.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">&quot;Deprecated in 2.1, use radians instead.&quot;</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span>
<span class="k">return</span> <span class="n">radians</span><span class="p">(</span><span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="bitwiseNOT"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bitwiseNOT.html#pyspark.sql.functions.bitwiseNOT">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">bitwiseNOT</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes bitwise not.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> .. deprecated:: 3.2.0</span>
<span class="sd"> Use :func:`bitwise_not` instead.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">&quot;Deprecated in 3.2, use bitwise_not instead.&quot;</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span>
<span class="k">return</span> <span class="n">bitwise_not</span><span class="p">(</span><span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="bitwise_not"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bitwise_not.html#pyspark.sql.functions.bitwise_not">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">bitwise_not</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes bitwise not.</span>
<span class="sd"> .. versionadded:: 3.2.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(bitwise_not(lit(0))).show()</span>
<span class="sd"> +---+</span>
<span class="sd"> | ~0|</span>
<span class="sd"> +---+</span>
<span class="sd"> | -1|</span>
<span class="sd"> +---+</span>
<span class="sd"> &gt;&gt;&gt; df.select(bitwise_not(lit(1))).show()</span>
<span class="sd"> +---+</span>
<span class="sd"> | ~1|</span>
<span class="sd"> +---+</span>
<span class="sd"> | -2|</span>
<span class="sd"> +---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;bitwise_not&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="bit_count"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bit_count.html#pyspark.sql.functions.bit_count">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">bit_count</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the number of bits that are set in the argument expr as an unsigned 64-bit integer,</span>
<span class="sd"> or NULL if the argument is NULL.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the number of bits that are set in the argument expr as an unsigned 64-bit integer,</span>
<span class="sd"> or NULL if the argument is NULL.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[1],[1],[2]], [&quot;c&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(bit_count(&quot;c&quot;)).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |bit_count(c)|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 1|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;bit_count&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="bit_get"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bit_get.html#pyspark.sql.functions.bit_get">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">bit_get</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">pos</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the value of the bit (0 or 1) at the specified position.</span>
<span class="sd"> The positions are numbered from right to left, starting at zero.</span>
<span class="sd"> The position argument cannot be negative.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> pos : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The positions are numbered from right to left, starting at zero.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the value of the bit (0 or 1) at the specified position.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[1],[1],[2]], [&quot;c&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(bit_get(&quot;c&quot;, lit(1))).show()</span>
<span class="sd"> +-------------+</span>
<span class="sd"> |bit_get(c, 1)|</span>
<span class="sd"> +-------------+</span>
<span class="sd"> | 0|</span>
<span class="sd"> | 0|</span>
<span class="sd"> | 1|</span>
<span class="sd"> +-------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;bit_get&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">pos</span><span class="p">)</span></div>
<div class="viewcode-block" id="getbit"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.getbit.html#pyspark.sql.functions.getbit">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">getbit</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">pos</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the value of the bit (0 or 1) at the specified position.</span>
<span class="sd"> The positions are numbered from right to left, starting at zero.</span>
<span class="sd"> The position argument cannot be negative.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> pos : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The positions are numbered from right to left, starting at zero.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the value of the bit (0 or 1) at the specified position.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [[1], [1], [2]], [&quot;c&quot;]</span>
<span class="sd"> ... ).select(sf.getbit(&quot;c&quot;, sf.lit(1))).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |getbit(c, 1)|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | 0|</span>
<span class="sd"> | 0|</span>
<span class="sd"> | 1|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;getbit&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">pos</span><span class="p">)</span></div>
<div class="viewcode-block" id="asc_nulls_first"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.asc_nulls_first.html#pyspark.sql.functions.asc_nulls_first">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">asc_nulls_first</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a sort expression based on the ascending order of the given</span>
<span class="sd"> column name, and null values return before non-null values.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to sort by in the ascending order.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column specifying the order.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(1, &quot;Bob&quot;),</span>
<span class="sd"> ... (0, None),</span>
<span class="sd"> ... (2, &quot;Alice&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df1.sort(asc_nulls_first(df1.name)).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 0| NULL|</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 1| Bob|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="p">(</span>
<span class="n">col</span><span class="o">.</span><span class="n">asc_nulls_first</span><span class="p">()</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span>
<span class="k">else</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;asc_nulls_first&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="asc_nulls_last"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.asc_nulls_last.html#pyspark.sql.functions.asc_nulls_last">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">asc_nulls_last</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a sort expression based on the ascending order of the given</span>
<span class="sd"> column name, and null values appear after non-null values.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to sort by in the ascending order.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column specifying the order.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(0, None),</span>
<span class="sd"> ... (1, &quot;Bob&quot;),</span>
<span class="sd"> ... (2, &quot;Alice&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df1.sort(asc_nulls_last(df1.name)).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 1| Bob|</span>
<span class="sd"> | 0| NULL|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="p">(</span>
<span class="n">col</span><span class="o">.</span><span class="n">asc_nulls_last</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> <span class="k">else</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;asc_nulls_last&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="desc_nulls_first"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.desc_nulls_first.html#pyspark.sql.functions.desc_nulls_first">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">desc_nulls_first</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a sort expression based on the descending order of the given</span>
<span class="sd"> column name, and null values appear before non-null values.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to sort by in the descending order.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column specifying the order.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(0, None),</span>
<span class="sd"> ... (1, &quot;Bob&quot;),</span>
<span class="sd"> ... (2, &quot;Alice&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df1.sort(desc_nulls_first(df1.name)).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 0| NULL|</span>
<span class="sd"> | 1| Bob|</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="p">(</span>
<span class="n">col</span><span class="o">.</span><span class="n">desc_nulls_first</span><span class="p">()</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span>
<span class="k">else</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;desc_nulls_first&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="desc_nulls_last"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.desc_nulls_last.html#pyspark.sql.functions.desc_nulls_last">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">desc_nulls_last</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a sort expression based on the descending order of the given</span>
<span class="sd"> column name, and null values appear after non-null values.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to sort by in the descending order.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column specifying the order.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(0, None),</span>
<span class="sd"> ... (1, &quot;Bob&quot;),</span>
<span class="sd"> ... (2, &quot;Alice&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df1.sort(desc_nulls_last(df1.name)).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 1| Bob|</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 0| NULL|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="p">(</span>
<span class="n">col</span><span class="o">.</span><span class="n">desc_nulls_last</span><span class="p">()</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span>
<span class="k">else</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;desc_nulls_last&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="stddev"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.stddev.html#pyspark.sql.functions.stddev">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">stddev</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: alias for stddev_samp.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> standard deviation of given column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(6).select(sf.stddev(&quot;id&quot;)).show()</span>
<span class="sd"> +------------------+</span>
<span class="sd"> | stddev(id)|</span>
<span class="sd"> +------------------+</span>
<span class="sd"> |1.8708286933869...|</span>
<span class="sd"> +------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;stddev&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="std"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.std.html#pyspark.sql.functions.std">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">std</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: alias for stddev_samp.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> standard deviation of given column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(6).select(sf.std(&quot;id&quot;)).show()</span>
<span class="sd"> +------------------+</span>
<span class="sd"> | std(id)|</span>
<span class="sd"> +------------------+</span>
<span class="sd"> |1.8708286933869...|</span>
<span class="sd"> +------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;std&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="stddev_samp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.stddev_samp.html#pyspark.sql.functions.stddev_samp">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">stddev_samp</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the unbiased sample standard deviation of</span>
<span class="sd"> the expression in a group.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> standard deviation of given column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(6).select(sf.stddev_samp(&quot;id&quot;)).show()</span>
<span class="sd"> +------------------+</span>
<span class="sd"> | stddev_samp(id)|</span>
<span class="sd"> +------------------+</span>
<span class="sd"> |1.8708286933869...|</span>
<span class="sd"> +------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;stddev_samp&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="stddev_pop"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.stddev_pop.html#pyspark.sql.functions.stddev_pop">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">stddev_pop</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns population standard deviation of</span>
<span class="sd"> the expression in a group.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> standard deviation of given column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(6).select(sf.stddev_pop(&quot;id&quot;)).show()</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> | stddev_pop(id)|</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> |1.707825127659...|</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;stddev_pop&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="variance"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.variance.html#pyspark.sql.functions.variance">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">variance</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: alias for var_samp</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> variance of given column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(6)</span>
<span class="sd"> &gt;&gt;&gt; df.select(variance(df.id)).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |var_samp(id)|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | 3.5|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;variance&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="var_samp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.var_samp.html#pyspark.sql.functions.var_samp">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">var_samp</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the unbiased sample variance of</span>
<span class="sd"> the values in a group.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> variance of given column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(6)</span>
<span class="sd"> &gt;&gt;&gt; df.select(var_samp(df.id)).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |var_samp(id)|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | 3.5|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;var_samp&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="var_pop"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.var_pop.html#pyspark.sql.functions.var_pop">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">var_pop</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the population variance of the values in a group.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> variance of given column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(6)</span>
<span class="sd"> &gt;&gt;&gt; df.select(var_pop(df.id)).first()</span>
<span class="sd"> Row(var_pop(id)=2.91666...)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;var_pop&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="regr_avgx"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regr_avgx.html#pyspark.sql.functions.regr_avgx">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">regr_avgx</span><span class="p">(</span><span class="n">y</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the average of the independent variable for non-null pairs</span>
<span class="sd"> in a group, where `y` is the dependent variable and `x` is the independent variable.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> y : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the dependent variable.</span>
<span class="sd"> x : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the independent variable.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the average of the independent variable for non-null pairs in a group.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; x = (col(&quot;id&quot;) % 3).alias(&quot;x&quot;)</span>
<span class="sd"> &gt;&gt;&gt; y = (randn(42) + x * 10).alias(&quot;y&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(0, 1000, 1, 1).select(x, y)</span>
<span class="sd"> &gt;&gt;&gt; df.select(regr_avgx(&quot;y&quot;, &quot;x&quot;)).first()</span>
<span class="sd"> Row(regr_avgx(y, x)=0.999)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;regr_avgx&quot;</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span></div>
<div class="viewcode-block" id="regr_avgy"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regr_avgy.html#pyspark.sql.functions.regr_avgy">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">regr_avgy</span><span class="p">(</span><span class="n">y</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the average of the dependent variable for non-null pairs</span>
<span class="sd"> in a group, where `y` is the dependent variable and `x` is the independent variable.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> y : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the dependent variable.</span>
<span class="sd"> x : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the independent variable.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the average of the dependent variable for non-null pairs in a group.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; x = (col(&quot;id&quot;) % 3).alias(&quot;x&quot;)</span>
<span class="sd"> &gt;&gt;&gt; y = (randn(42) + x * 10).alias(&quot;y&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(0, 1000, 1, 1).select(x, y)</span>
<span class="sd"> &gt;&gt;&gt; df.select(regr_avgy(&quot;y&quot;, &quot;x&quot;)).first()</span>
<span class="sd"> Row(regr_avgy(y, x)=9.980732994136464)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;regr_avgy&quot;</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span></div>
<div class="viewcode-block" id="regr_count"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regr_count.html#pyspark.sql.functions.regr_count">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">regr_count</span><span class="p">(</span><span class="n">y</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the number of non-null number pairs</span>
<span class="sd"> in a group, where `y` is the dependent variable and `x` is the independent variable.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> y : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the dependent variable.</span>
<span class="sd"> x : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the independent variable.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the number of non-null number pairs in a group.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; x = (col(&quot;id&quot;) % 3).alias(&quot;x&quot;)</span>
<span class="sd"> &gt;&gt;&gt; y = (randn(42) + x * 10).alias(&quot;y&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(0, 1000, 1, 1).select(x, y)</span>
<span class="sd"> &gt;&gt;&gt; df.select(regr_count(&quot;y&quot;, &quot;x&quot;)).first()</span>
<span class="sd"> Row(regr_count(y, x)=1000)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;regr_count&quot;</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span></div>
<div class="viewcode-block" id="regr_intercept"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regr_intercept.html#pyspark.sql.functions.regr_intercept">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">regr_intercept</span><span class="p">(</span><span class="n">y</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the intercept of the univariate linear regression line</span>
<span class="sd"> for non-null pairs in a group, where `y` is the dependent variable and</span>
<span class="sd"> `x` is the independent variable.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> y : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the dependent variable.</span>
<span class="sd"> x : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the independent variable.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the intercept of the univariate linear regression line for non-null pairs in a group.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; x = (col(&quot;id&quot;) % 3).alias(&quot;x&quot;)</span>
<span class="sd"> &gt;&gt;&gt; y = (randn(42) + x * 10).alias(&quot;y&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(0, 1000, 1, 1).select(x, y)</span>
<span class="sd"> &gt;&gt;&gt; df.select(regr_intercept(&quot;y&quot;, &quot;x&quot;)).first()</span>
<span class="sd"> Row(regr_intercept(y, x)=-0.04961745990969568)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;regr_intercept&quot;</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span></div>
<div class="viewcode-block" id="regr_r2"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regr_r2.html#pyspark.sql.functions.regr_r2">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">regr_r2</span><span class="p">(</span><span class="n">y</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the coefficient of determination for non-null pairs</span>
<span class="sd"> in a group, where `y` is the dependent variable and `x` is the independent variable.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> y : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the dependent variable.</span>
<span class="sd"> x : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the independent variable.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the coefficient of determination for non-null pairs in a group.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; x = (col(&quot;id&quot;) % 3).alias(&quot;x&quot;)</span>
<span class="sd"> &gt;&gt;&gt; y = (randn(42) + x * 10).alias(&quot;y&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(0, 1000, 1, 1).select(x, y)</span>
<span class="sd"> &gt;&gt;&gt; df.select(regr_r2(&quot;y&quot;, &quot;x&quot;)).first()</span>
<span class="sd"> Row(regr_r2(y, x)=0.9851908293645436)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;regr_r2&quot;</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span></div>
<div class="viewcode-block" id="regr_slope"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regr_slope.html#pyspark.sql.functions.regr_slope">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">regr_slope</span><span class="p">(</span><span class="n">y</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the slope of the linear regression line for non-null pairs</span>
<span class="sd"> in a group, where `y` is the dependent variable and `x` is the independent variable.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> y : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the dependent variable.</span>
<span class="sd"> x : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the independent variable.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the slope of the linear regression line for non-null pairs in a group.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; x = (col(&quot;id&quot;) % 3).alias(&quot;x&quot;)</span>
<span class="sd"> &gt;&gt;&gt; y = (randn(42) + x * 10).alias(&quot;y&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(0, 1000, 1, 1).select(x, y)</span>
<span class="sd"> &gt;&gt;&gt; df.select(regr_slope(&quot;y&quot;, &quot;x&quot;)).first()</span>
<span class="sd"> Row(regr_slope(y, x)=10.040390844891048)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;regr_slope&quot;</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span></div>
<div class="viewcode-block" id="regr_sxx"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regr_sxx.html#pyspark.sql.functions.regr_sxx">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">regr_sxx</span><span class="p">(</span><span class="n">y</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns REGR_COUNT(y, x) * VAR_POP(x) for non-null pairs</span>
<span class="sd"> in a group, where `y` is the dependent variable and `x` is the independent variable.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> y : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the dependent variable.</span>
<span class="sd"> x : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the independent variable.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> REGR_COUNT(y, x) * VAR_POP(x) for non-null pairs in a group.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; x = (col(&quot;id&quot;) % 3).alias(&quot;x&quot;)</span>
<span class="sd"> &gt;&gt;&gt; y = (randn(42) + x * 10).alias(&quot;y&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(0, 1000, 1, 1).select(x, y)</span>
<span class="sd"> &gt;&gt;&gt; df.select(regr_sxx(&quot;y&quot;, &quot;x&quot;)).first()</span>
<span class="sd"> Row(regr_sxx(y, x)=666.9989999999996)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;regr_sxx&quot;</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span></div>
<div class="viewcode-block" id="regr_sxy"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regr_sxy.html#pyspark.sql.functions.regr_sxy">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">regr_sxy</span><span class="p">(</span><span class="n">y</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns REGR_COUNT(y, x) * COVAR_POP(y, x) for non-null pairs</span>
<span class="sd"> in a group, where `y` is the dependent variable and `x` is the independent variable.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> y : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the dependent variable.</span>
<span class="sd"> x : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the independent variable.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> REGR_COUNT(y, x) * COVAR_POP(y, x) for non-null pairs in a group.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; x = (col(&quot;id&quot;) % 3).alias(&quot;x&quot;)</span>
<span class="sd"> &gt;&gt;&gt; y = (randn(42) + x * 10).alias(&quot;y&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(0, 1000, 1, 1).select(x, y)</span>
<span class="sd"> &gt;&gt;&gt; df.select(regr_sxy(&quot;y&quot;, &quot;x&quot;)).first()</span>
<span class="sd"> Row(regr_sxy(y, x)=6696.93065315148)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;regr_sxy&quot;</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span></div>
<div class="viewcode-block" id="regr_syy"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regr_syy.html#pyspark.sql.functions.regr_syy">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">regr_syy</span><span class="p">(</span><span class="n">y</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns REGR_COUNT(y, x) * VAR_POP(y) for non-null pairs</span>
<span class="sd"> in a group, where `y` is the dependent variable and `x` is the independent variable.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> y : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the dependent variable.</span>
<span class="sd"> x : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the independent variable.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> REGR_COUNT(y, x) * VAR_POP(y) for non-null pairs in a group.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; x = (col(&quot;id&quot;) % 3).alias(&quot;x&quot;)</span>
<span class="sd"> &gt;&gt;&gt; y = (randn(42) + x * 10).alias(&quot;y&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(0, 1000, 1, 1).select(x, y)</span>
<span class="sd"> &gt;&gt;&gt; df.select(regr_syy(&quot;y&quot;, &quot;x&quot;)).first()</span>
<span class="sd"> Row(regr_syy(y, x)=68250.53503811295)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;regr_syy&quot;</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span></div>
<div class="viewcode-block" id="every"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.every.html#pyspark.sql.functions.every">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">every</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns true if all values of `col` are true.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column to check if all values are true.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> true if all values of `col` are true, false otherwise.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [[True], [True], [True]], [&quot;flag&quot;]</span>
<span class="sd"> ... ).select(sf.every(&quot;flag&quot;)).show()</span>
<span class="sd"> +-----------+</span>
<span class="sd"> |every(flag)|</span>
<span class="sd"> +-----------+</span>
<span class="sd"> | true|</span>
<span class="sd"> +-----------+</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [[True], [False], [True]], [&quot;flag&quot;]</span>
<span class="sd"> ... ).select(sf.every(&quot;flag&quot;)).show()</span>
<span class="sd"> +-----------+</span>
<span class="sd"> |every(flag)|</span>
<span class="sd"> +-----------+</span>
<span class="sd"> | false|</span>
<span class="sd"> +-----------+</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [[False], [False], [False]], [&quot;flag&quot;]</span>
<span class="sd"> ... ).select(sf.every(&quot;flag&quot;)).show()</span>
<span class="sd"> +-----------+</span>
<span class="sd"> |every(flag)|</span>
<span class="sd"> +-----------+</span>
<span class="sd"> | false|</span>
<span class="sd"> +-----------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;every&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="bool_and"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bool_and.html#pyspark.sql.functions.bool_and">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">bool_and</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns true if all values of `col` are true.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column to check if all values are true.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> true if all values of `col` are true, false otherwise.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[True], [True], [True]], [&quot;flag&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(bool_and(&quot;flag&quot;)).show()</span>
<span class="sd"> +--------------+</span>
<span class="sd"> |bool_and(flag)|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> | true|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[True], [False], [True]], [&quot;flag&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(bool_and(&quot;flag&quot;)).show()</span>
<span class="sd"> +--------------+</span>
<span class="sd"> |bool_and(flag)|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> | false|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[False], [False], [False]], [&quot;flag&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(bool_and(&quot;flag&quot;)).show()</span>
<span class="sd"> +--------------+</span>
<span class="sd"> |bool_and(flag)|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> | false|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;bool_and&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="some"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.some.html#pyspark.sql.functions.some">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">some</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns true if at least one value of `col` is true.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column to check if at least one value is true.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> true if at least one value of `col` is true, false otherwise.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [[True], [True], [True]], [&quot;flag&quot;]</span>
<span class="sd"> ... ).select(sf.some(&quot;flag&quot;)).show()</span>
<span class="sd"> +----------+</span>
<span class="sd"> |some(flag)|</span>
<span class="sd"> +----------+</span>
<span class="sd"> | true|</span>
<span class="sd"> +----------+</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [[True], [False], [True]], [&quot;flag&quot;]</span>
<span class="sd"> ... ).select(sf.some(&quot;flag&quot;)).show()</span>
<span class="sd"> +----------+</span>
<span class="sd"> |some(flag)|</span>
<span class="sd"> +----------+</span>
<span class="sd"> | true|</span>
<span class="sd"> +----------+</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [[False], [False], [False]], [&quot;flag&quot;]</span>
<span class="sd"> ... ).select(sf.some(&quot;flag&quot;)).show()</span>
<span class="sd"> +----------+</span>
<span class="sd"> |some(flag)|</span>
<span class="sd"> +----------+</span>
<span class="sd"> | false|</span>
<span class="sd"> +----------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;some&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="bool_or"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bool_or.html#pyspark.sql.functions.bool_or">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">bool_or</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns true if at least one value of `col` is true.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column to check if at least one value is true.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> true if at least one value of `col` is true, false otherwise.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[True], [True], [True]], [&quot;flag&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(bool_or(&quot;flag&quot;)).show()</span>
<span class="sd"> +-------------+</span>
<span class="sd"> |bool_or(flag)|</span>
<span class="sd"> +-------------+</span>
<span class="sd"> | true|</span>
<span class="sd"> +-------------+</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[True], [False], [True]], [&quot;flag&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(bool_or(&quot;flag&quot;)).show()</span>
<span class="sd"> +-------------+</span>
<span class="sd"> |bool_or(flag)|</span>
<span class="sd"> +-------------+</span>
<span class="sd"> | true|</span>
<span class="sd"> +-------------+</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[False], [False], [False]], [&quot;flag&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(bool_or(&quot;flag&quot;)).show()</span>
<span class="sd"> +-------------+</span>
<span class="sd"> |bool_or(flag)|</span>
<span class="sd"> +-------------+</span>
<span class="sd"> | false|</span>
<span class="sd"> +-------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;bool_or&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="bit_and"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bit_and.html#pyspark.sql.functions.bit_and">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">bit_and</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the bitwise AND of all non-null input values, or null if none.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the bitwise AND of all non-null input values, or null if none.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[1],[1],[2]], [&quot;c&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(bit_and(&quot;c&quot;)).first()</span>
<span class="sd"> Row(bit_and(c)=0)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;bit_and&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="bit_or"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bit_or.html#pyspark.sql.functions.bit_or">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">bit_or</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the bitwise OR of all non-null input values, or null if none.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the bitwise OR of all non-null input values, or null if none.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[1],[1],[2]], [&quot;c&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(bit_or(&quot;c&quot;)).first()</span>
<span class="sd"> Row(bit_or(c)=3)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;bit_or&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="bit_xor"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bit_xor.html#pyspark.sql.functions.bit_xor">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">bit_xor</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the bitwise XOR of all non-null input values, or null if none.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the bitwise XOR of all non-null input values, or null if none.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[1],[1],[2]], [&quot;c&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(bit_xor(&quot;c&quot;)).first()</span>
<span class="sd"> Row(bit_xor(c)=2)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;bit_xor&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="skewness"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.skewness.html#pyspark.sql.functions.skewness">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">skewness</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the skewness of the values in a group.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> skewness of given column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[1],[1],[2]], [&quot;c&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(skewness(df.c)).first()</span>
<span class="sd"> Row(skewness(c)=0.70710...)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;skewness&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="kurtosis"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.kurtosis.html#pyspark.sql.functions.kurtosis">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">kurtosis</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the kurtosis of the values in a group.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> kurtosis of given column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[1],[1],[2]], [&quot;c&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(kurtosis(df.c)).show()</span>
<span class="sd"> +-----------+</span>
<span class="sd"> |kurtosis(c)|</span>
<span class="sd"> +-----------+</span>
<span class="sd"> | -1.5|</span>
<span class="sd"> +-----------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;kurtosis&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="collect_list"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.collect_list.html#pyspark.sql.functions.collect_list">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">collect_list</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns a list of objects with duplicates.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The function is non-deterministic because the order of collected results depends</span>
<span class="sd"> on the order of the rows which may be non-deterministic after a shuffle.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> list of objects with duplicates.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(2,), (5,), (5,)], (&#39;age&#39;,))</span>
<span class="sd"> &gt;&gt;&gt; df2.agg(collect_list(&#39;age&#39;)).collect()</span>
<span class="sd"> [Row(collect_list(age)=[2, 5, 5])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;collect_list&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="array_agg"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_agg.html#pyspark.sql.functions.array_agg">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">array_agg</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns a list of objects with duplicates.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> list of objects with duplicates.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[1],[1],[2]], [&quot;c&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.agg(array_agg(&#39;c&#39;).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=[1, 1, 2])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;array_agg&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="collect_set"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.collect_set.html#pyspark.sql.functions.collect_set">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">collect_set</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns a set of objects with duplicate elements eliminated.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The function is non-deterministic because the order of collected results depends</span>
<span class="sd"> on the order of the rows which may be non-deterministic after a shuffle.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> list of objects with no duplicates.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(2,), (5,), (5,)], (&#39;age&#39;,))</span>
<span class="sd"> &gt;&gt;&gt; df2.agg(array_sort(collect_set(&#39;age&#39;)).alias(&#39;c&#39;)).collect()</span>
<span class="sd"> [Row(c=[2, 5])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;collect_set&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="degrees"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.degrees.html#pyspark.sql.functions.degrees">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">degrees</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts an angle measured in radians to an approximately equivalent angle</span>
<span class="sd"> measured in degrees.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> angle in radians</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> angle in degrees, as if computed by `java.lang.Math.toDegrees()`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import math</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(degrees(lit(math.pi))).first()</span>
<span class="sd"> Row(DEGREES(3.14159...)=180.0)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;degrees&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="radians"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.radians.html#pyspark.sql.functions.radians">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">radians</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts an angle measured in degrees to an approximately equivalent angle</span>
<span class="sd"> measured in radians.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> angle in degrees</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> angle in radians, as if computed by `java.lang.Math.toRadians()`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(radians(lit(180))).first()</span>
<span class="sd"> Row(RADIANS(180)=3.14159...)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;radians&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="atan2"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.atan2.html#pyspark.sql.functions.atan2">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">atan2</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">float</span><span class="p">],</span> <span class="n">col2</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">float</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : str, :class:`~pyspark.sql.Column` or float</span>
<span class="sd"> coordinate on y-axis</span>
<span class="sd"> col2 : str, :class:`~pyspark.sql.Column` or float</span>
<span class="sd"> coordinate on x-axis</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the `theta` component of the point</span>
<span class="sd"> (`r`, `theta`)</span>
<span class="sd"> in polar coordinates that corresponds to the point</span>
<span class="sd"> (`x`, `y`) in Cartesian coordinates,</span>
<span class="sd"> as if computed by `java.lang.Math.atan2()`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(atan2(lit(1), lit(2))).first()</span>
<span class="sd"> Row(ATAN2(1, 2)=0.46364...)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_binary_math_function</span><span class="p">(</span><span class="s2">&quot;atan2&quot;</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div>
<div class="viewcode-block" id="hypot"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.hypot.html#pyspark.sql.functions.hypot">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">hypot</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">float</span><span class="p">],</span> <span class="n">col2</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">float</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes ``sqrt(a^2 + b^2)`` without intermediate overflow or underflow.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : str, :class:`~pyspark.sql.Column` or float</span>
<span class="sd"> a leg.</span>
<span class="sd"> col2 : str, :class:`~pyspark.sql.Column` or float</span>
<span class="sd"> b leg.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> length of the hypotenuse.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(hypot(lit(1), lit(2))).first()</span>
<span class="sd"> Row(HYPOT(1, 2)=2.23606...)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_binary_math_function</span><span class="p">(</span><span class="s2">&quot;hypot&quot;</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div>
<div class="viewcode-block" id="pow"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.pow.html#pyspark.sql.functions.pow">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">pow</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">float</span><span class="p">],</span> <span class="n">col2</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">float</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the value of the first argument raised to the power of the second argument.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : str, :class:`~pyspark.sql.Column` or float</span>
<span class="sd"> the base number.</span>
<span class="sd"> col2 : str, :class:`~pyspark.sql.Column` or float</span>
<span class="sd"> the exponent number.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the base rased to the power the argument.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(pow(lit(3), lit(2))).first()</span>
<span class="sd"> Row(POWER(3, 2)=9.0)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_binary_math_function</span><span class="p">(</span><span class="s2">&quot;pow&quot;</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div>
<span class="n">power</span> <span class="o">=</span> <span class="nb">pow</span>
<div class="viewcode-block" id="pmod"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.pmod.html#pyspark.sql.functions.pmod">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">pmod</span><span class="p">(</span><span class="n">dividend</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">float</span><span class="p">],</span> <span class="n">divisor</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">float</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the positive value of dividend mod divisor.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> dividend : str, :class:`~pyspark.sql.Column` or float</span>
<span class="sd"> the column that contains dividend, or the specified dividend value</span>
<span class="sd"> divisor : str, :class:`~pyspark.sql.Column` or float</span>
<span class="sd"> the column that contains divisor, or the specified divisor value</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> positive value of dividend mod divisor.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import pmod</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (1.0, float(&#39;nan&#39;)), (float(&#39;nan&#39;), 2.0), (10.0, 3.0),</span>
<span class="sd"> ... (float(&#39;nan&#39;), float(&#39;nan&#39;)), (-3.0, 4.0), (-10.0, 3.0),</span>
<span class="sd"> ... (-5.0, -6.0), (7.0, -8.0), (1.0, 2.0)],</span>
<span class="sd"> ... (&quot;a&quot;, &quot;b&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(pmod(&quot;a&quot;, &quot;b&quot;)).show()</span>
<span class="sd"> +----------+</span>
<span class="sd"> |pmod(a, b)|</span>
<span class="sd"> +----------+</span>
<span class="sd"> | NaN|</span>
<span class="sd"> | NaN|</span>
<span class="sd"> | 1.0|</span>
<span class="sd"> | NaN|</span>
<span class="sd"> | 1.0|</span>
<span class="sd"> | 2.0|</span>
<span class="sd"> | -5.0|</span>
<span class="sd"> | 7.0|</span>
<span class="sd"> | 1.0|</span>
<span class="sd"> +----------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_binary_math_function</span><span class="p">(</span><span class="s2">&quot;pmod&quot;</span><span class="p">,</span> <span class="n">dividend</span><span class="p">,</span> <span class="n">divisor</span><span class="p">)</span></div>
<div class="viewcode-block" id="width_bucket"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.width_bucket.html#pyspark.sql.functions.width_bucket">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">width_bucket</span><span class="p">(</span>
<span class="n">v</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="nb">min</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="nb">max</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">numBucket</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">int</span><span class="p">],</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the bucket number into which the value of this expression would fall</span>
<span class="sd"> after being evaluated. Note that input arguments must follow conditions listed below;</span>
<span class="sd"> otherwise, the method will return null.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> v : str or :class:`~pyspark.sql.Column`</span>
<span class="sd"> value to compute a bucket number in the histogram</span>
<span class="sd"> min : str or :class:`~pyspark.sql.Column`</span>
<span class="sd"> minimum value of the histogram</span>
<span class="sd"> max : str or :class:`~pyspark.sql.Column`</span>
<span class="sd"> maximum value of the histogram</span>
<span class="sd"> numBucket : str, :class:`~pyspark.sql.Column` or int</span>
<span class="sd"> the number of buckets</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the bucket number into which the value would fall after being evaluated</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (5.3, 0.2, 10.6, 5),</span>
<span class="sd"> ... (-2.1, 1.3, 3.4, 3),</span>
<span class="sd"> ... (8.1, 0.0, 5.7, 4),</span>
<span class="sd"> ... (-0.9, 5.2, 0.5, 2)],</span>
<span class="sd"> ... [&#39;v&#39;, &#39;min&#39;, &#39;max&#39;, &#39;n&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(width_bucket(&#39;v&#39;, &#39;min&#39;, &#39;max&#39;, &#39;n&#39;)).show()</span>
<span class="sd"> +----------------------------+</span>
<span class="sd"> |width_bucket(v, min, max, n)|</span>
<span class="sd"> +----------------------------+</span>
<span class="sd"> | 3|</span>
<span class="sd"> | 0|</span>
<span class="sd"> | 5|</span>
<span class="sd"> | 3|</span>
<span class="sd"> +----------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">numBucket</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">numBucket</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">numBucket</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">numBucket</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;width_bucket&quot;</span><span class="p">,</span> <span class="n">v</span><span class="p">,</span> <span class="nb">min</span><span class="p">,</span> <span class="nb">max</span><span class="p">,</span> <span class="n">numBucket</span><span class="p">)</span></div>
<div class="viewcode-block" id="row_number"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.row_number.html#pyspark.sql.functions.row_number">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">row_number</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Window function: returns a sequential number starting at 1 within a window partition.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for calculating row numbers.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Window</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(3)</span>
<span class="sd"> &gt;&gt;&gt; w = Window.orderBy(df.id.desc())</span>
<span class="sd"> &gt;&gt;&gt; df.withColumn(&quot;desc_order&quot;, row_number().over(w)).show()</span>
<span class="sd"> +---+----------+</span>
<span class="sd"> | id|desc_order|</span>
<span class="sd"> +---+----------+</span>
<span class="sd"> | 2| 1|</span>
<span class="sd"> | 1| 2|</span>
<span class="sd"> | 0| 3|</span>
<span class="sd"> +---+----------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;row_number&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="dense_rank"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.dense_rank.html#pyspark.sql.functions.dense_rank">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">dense_rank</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Window function: returns the rank of rows within a window partition, without any gaps.</span>
<span class="sd"> The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking</span>
<span class="sd"> sequence when there are ties. That is, if you were ranking a competition using dense_rank</span>
<span class="sd"> and had three people tie for second place, you would say that all three were in second</span>
<span class="sd"> place and that the next person came in third. Rank would give me sequential numbers, making</span>
<span class="sd"> the person that came in third place (after the ties) would register as coming in fifth.</span>
<span class="sd"> This is equivalent to the DENSE_RANK function in SQL.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for calculating ranks.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Window, types</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([1, 1, 2, 3, 3, 4], types.IntegerType())</span>
<span class="sd"> &gt;&gt;&gt; w = Window.orderBy(&quot;value&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.withColumn(&quot;drank&quot;, dense_rank().over(w)).show()</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> |value|drank|</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> | 1| 1|</span>
<span class="sd"> | 1| 1|</span>
<span class="sd"> | 2| 2|</span>
<span class="sd"> | 3| 3|</span>
<span class="sd"> | 3| 3|</span>
<span class="sd"> | 4| 4|</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;dense_rank&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="rank"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.rank.html#pyspark.sql.functions.rank">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">rank</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Window function: returns the rank of rows within a window partition.</span>
<span class="sd"> The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking</span>
<span class="sd"> sequence when there are ties. That is, if you were ranking a competition using dense_rank</span>
<span class="sd"> and had three people tie for second place, you would say that all three were in second</span>
<span class="sd"> place and that the next person came in third. Rank would give me sequential numbers, making</span>
<span class="sd"> the person that came in third place (after the ties) would register as coming in fifth.</span>
<span class="sd"> This is equivalent to the RANK function in SQL.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for calculating ranks.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Window, types</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([1, 1, 2, 3, 3, 4], types.IntegerType())</span>
<span class="sd"> &gt;&gt;&gt; w = Window.orderBy(&quot;value&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.withColumn(&quot;drank&quot;, rank().over(w)).show()</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> |value|drank|</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> | 1| 1|</span>
<span class="sd"> | 1| 1|</span>
<span class="sd"> | 2| 3|</span>
<span class="sd"> | 3| 4|</span>
<span class="sd"> | 3| 4|</span>
<span class="sd"> | 4| 6|</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;rank&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="cume_dist"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.cume_dist.html#pyspark.sql.functions.cume_dist">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">cume_dist</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Window function: returns the cumulative distribution of values within a window partition,</span>
<span class="sd"> i.e. the fraction of rows that are below the current row.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for calculating cumulative distribution.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Window, types</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([1, 2, 3, 3, 4], types.IntegerType())</span>
<span class="sd"> &gt;&gt;&gt; w = Window.orderBy(&quot;value&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.withColumn(&quot;cd&quot;, cume_dist().over(w)).show()</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> |value| cd|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> | 1|0.2|</span>
<span class="sd"> | 2|0.4|</span>
<span class="sd"> | 3|0.8|</span>
<span class="sd"> | 3|0.8|</span>
<span class="sd"> | 4|1.0|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;cume_dist&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="percent_rank"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.percent_rank.html#pyspark.sql.functions.percent_rank">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">percent_rank</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Window function: returns the relative rank (i.e. percentile) of rows within a window partition.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for calculating relative rank.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Window, types</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([1, 1, 2, 3, 3, 4], types.IntegerType())</span>
<span class="sd"> &gt;&gt;&gt; w = Window.orderBy(&quot;value&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.withColumn(&quot;pr&quot;, percent_rank().over(w)).show()</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> |value| pr|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> | 1|0.0|</span>
<span class="sd"> | 1|0.0|</span>
<span class="sd"> | 2|0.4|</span>
<span class="sd"> | 3|0.6|</span>
<span class="sd"> | 3|0.6|</span>
<span class="sd"> | 4|1.0|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;percent_rank&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="approxCountDistinct"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.approxCountDistinct.html#pyspark.sql.functions.approxCountDistinct">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">approxCountDistinct</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">rsd</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> .. deprecated:: 2.1.0</span>
<span class="sd"> Use :func:`approx_count_distinct` instead.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">&quot;Deprecated in 2.1, use approx_count_distinct instead.&quot;</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span>
<span class="k">return</span> <span class="n">approx_count_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">rsd</span><span class="p">)</span></div>
<div class="viewcode-block" id="approx_count_distinct"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.approx_count_distinct.html#pyspark.sql.functions.approx_count_distinct">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">approx_count_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">rsd</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Aggregate function: returns a new :class:`~pyspark.sql.Column` for approximate distinct count</span>
<span class="sd"> of column `col`.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> rsd : float, optional</span>
<span class="sd"> maximum relative standard deviation allowed (default = 0.05).</span>
<span class="sd"> For rsd &lt; 0.01, it is more efficient to use :func:`count_distinct`</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column of computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([1,2,2,3], &quot;INT&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.agg(approx_count_distinct(&quot;value&quot;).alias(&#39;distinct_values&#39;)).show()</span>
<span class="sd"> +---------------+</span>
<span class="sd"> |distinct_values|</span>
<span class="sd"> +---------------+</span>
<span class="sd"> | 3|</span>
<span class="sd"> +---------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">rsd</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;approx_count_distinct&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;approx_count_distinct&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">rsd</span><span class="p">)</span></div>
<div class="viewcode-block" id="broadcast"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.broadcast.html#pyspark.sql.functions.broadcast">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">broadcast</span><span class="p">(</span><span class="n">df</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Marks a DataFrame as small enough for use in broadcast joins.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.DataFrame`</span>
<span class="sd"> DataFrame marked as ready for broadcast join.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import types</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([1, 2, 3, 3, 4], types.IntegerType())</span>
<span class="sd"> &gt;&gt;&gt; df_small = spark.range(3)</span>
<span class="sd"> &gt;&gt;&gt; df_b = broadcast(df_small)</span>
<span class="sd"> &gt;&gt;&gt; df.join(df_b, df.value == df_small.id).show()</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> |value| id|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> | 1| 1|</span>
<span class="sd"> | 2| 2|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">JVMView</span><span class="p">,</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="p">)</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">broadcast</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">_jdf</span><span class="p">),</span> <span class="n">df</span><span class="o">.</span><span class="n">sparkSession</span><span class="p">)</span></div>
<div class="viewcode-block" id="coalesce"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.coalesce.html#pyspark.sql.functions.coalesce">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">coalesce</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the first column that is not null.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> list of columns to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> value of the first column that is not null.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; cDf = spark.createDataFrame([(None, None), (1, None), (None, 2)], (&quot;a&quot;, &quot;b&quot;))</span>
<span class="sd"> &gt;&gt;&gt; cDf.show()</span>
<span class="sd"> +----+----+</span>
<span class="sd"> | a| b|</span>
<span class="sd"> +----+----+</span>
<span class="sd"> |NULL|NULL|</span>
<span class="sd"> | 1|NULL|</span>
<span class="sd"> |NULL| 2|</span>
<span class="sd"> +----+----+</span>
<span class="sd"> &gt;&gt;&gt; cDf.select(coalesce(cDf[&quot;a&quot;], cDf[&quot;b&quot;])).show()</span>
<span class="sd"> +--------------+</span>
<span class="sd"> |coalesce(a, b)|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> | NULL|</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 2|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> &gt;&gt;&gt; cDf.select(&#39;*&#39;, coalesce(cDf[&quot;a&quot;], lit(0.0))).show()</span>
<span class="sd"> +----+----+----------------+</span>
<span class="sd"> | a| b|coalesce(a, 0.0)|</span>
<span class="sd"> +----+----+----------------+</span>
<span class="sd"> |NULL|NULL| 0.0|</span>
<span class="sd"> | 1|NULL| 1.0|</span>
<span class="sd"> |NULL| 2| 0.0|</span>
<span class="sd"> +----+----+----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">&quot;coalesce&quot;</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div>
<div class="viewcode-block" id="corr"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.corr.html#pyspark.sql.functions.corr">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">corr</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a new :class:`~pyspark.sql.Column` for the Pearson Correlation Coefficient for</span>
<span class="sd"> ``col1`` and ``col2``.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> first column to calculate correlation.</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> second column to calculate correlation.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> Pearson Correlation Coefficient of these two column values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; a = range(20)</span>
<span class="sd"> &gt;&gt;&gt; b = [2 * x for x in range(20)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(zip(a, b), [&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.agg(corr(&quot;a&quot;, &quot;b&quot;).alias(&#39;c&#39;)).collect()</span>
<span class="sd"> [Row(c=1.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;corr&quot;</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div>
<div class="viewcode-block" id="covar_pop"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.covar_pop.html#pyspark.sql.functions.covar_pop">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">covar_pop</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a new :class:`~pyspark.sql.Column` for the population covariance of ``col1`` and</span>
<span class="sd"> ``col2``.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> first column to calculate covariance.</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> second column to calculate covariance.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> covariance of these two column values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; a = [1] * 10</span>
<span class="sd"> &gt;&gt;&gt; b = [1] * 10</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(zip(a, b), [&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.agg(covar_pop(&quot;a&quot;, &quot;b&quot;).alias(&#39;c&#39;)).collect()</span>
<span class="sd"> [Row(c=0.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;covar_pop&quot;</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div>
<div class="viewcode-block" id="covar_samp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.covar_samp.html#pyspark.sql.functions.covar_samp">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">covar_samp</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a new :class:`~pyspark.sql.Column` for the sample covariance of ``col1`` and</span>
<span class="sd"> ``col2``.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> first column to calculate covariance.</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> second column to calculate covariance.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> sample covariance of these two column values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; a = [1] * 10</span>
<span class="sd"> &gt;&gt;&gt; b = [1] * 10</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(zip(a, b), [&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.agg(covar_samp(&quot;a&quot;, &quot;b&quot;).alias(&#39;c&#39;)).collect()</span>
<span class="sd"> [Row(c=0.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;covar_samp&quot;</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div>
<div class="viewcode-block" id="countDistinct"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.countDistinct.html#pyspark.sql.functions.countDistinct">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">countDistinct</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a new :class:`~pyspark.sql.Column` for distinct count of ``col`` or ``cols``.</span>
<span class="sd"> An alias of :func:`count_distinct`, and it is encouraged to use :func:`count_distinct`</span>
<span class="sd"> directly.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">count_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">)</span></div>
<div class="viewcode-block" id="count_distinct"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.count_distinct.html#pyspark.sql.functions.count_distinct">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">count_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.</span>
<span class="sd"> .. versionadded:: 3.2.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> first column to compute on.</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> other columns to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> distinct values of these two column values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import types</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([1, 1, 3], types.IntegerType())</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([1, 2], types.IntegerType())</span>
<span class="sd"> &gt;&gt;&gt; df1.join(df2).show()</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> |value|value|</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> | 1| 1|</span>
<span class="sd"> | 1| 2|</span>
<span class="sd"> | 1| 1|</span>
<span class="sd"> | 1| 2|</span>
<span class="sd"> | 3| 1|</span>
<span class="sd"> | 3| 2|</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> &gt;&gt;&gt; df1.join(df2).select(count_distinct(df1.value, df2.value)).show()</span>
<span class="sd"> +----------------------------+</span>
<span class="sd"> |count(DISTINCT value, value)|</span>
<span class="sd"> +----------------------------+</span>
<span class="sd"> | 4|</span>
<span class="sd"> +----------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span>
<span class="s2">&quot;count_distinct&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">)</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="first"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.first.html#pyspark.sql.functions.first">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">first</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">ignorenulls</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Aggregate function: returns the first value in a group.</span>
<span class="sd"> The function by default returns the first values it sees. It will return the first non-null</span>
<span class="sd"> value it sees when ignoreNulls is set to true. If all values are null, then null is returned.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The function is non-deterministic because its results depends on the order of the</span>
<span class="sd"> rows which may be non-deterministic after a shuffle.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column to fetch first value for.</span>
<span class="sd"> ignorenulls : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> if first value is null then look for first non-null value.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> first value of the group.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;Alice&quot;, 2), (&quot;Bob&quot;, 5), (&quot;Alice&quot;, None)], (&quot;name&quot;, &quot;age&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df = df.orderBy(df.age)</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;name&quot;).agg(first(&quot;age&quot;)).orderBy(&quot;name&quot;).show()</span>
<span class="sd"> +-----+----------+</span>
<span class="sd"> | name|first(age)|</span>
<span class="sd"> +-----+----------+</span>
<span class="sd"> |Alice| NULL|</span>
<span class="sd"> | Bob| 5|</span>
<span class="sd"> +-----+----------+</span>
<span class="sd"> Now, to ignore any nulls we needs to set ``ignorenulls`` to `True`</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;name&quot;).agg(first(&quot;age&quot;, ignorenulls=True)).orderBy(&quot;name&quot;).show()</span>
<span class="sd"> +-----+----------+</span>
<span class="sd"> | name|first(age)|</span>
<span class="sd"> +-----+----------+</span>
<span class="sd"> |Alice| 2|</span>
<span class="sd"> | Bob| 5|</span>
<span class="sd"> +-----+----------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;first&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">ignorenulls</span><span class="p">)</span></div>
<div class="viewcode-block" id="grouping"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.grouping.html#pyspark.sql.functions.grouping">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">grouping</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: indicates whether a specified column in a GROUP BY list is aggregated</span>
<span class="sd"> or not, returns 1 for aggregated or 0 for not aggregated in the result set.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column to check if it&#39;s aggregated.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> returns 1 for aggregated or 0 for not aggregated in the result set.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;Alice&quot;, 2), (&quot;Bob&quot;, 5)], (&quot;name&quot;, &quot;age&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.cube(&quot;name&quot;).agg(grouping(&quot;name&quot;), sum(&quot;age&quot;)).orderBy(&quot;name&quot;).show()</span>
<span class="sd"> +-----+--------------+--------+</span>
<span class="sd"> | name|grouping(name)|sum(age)|</span>
<span class="sd"> +-----+--------------+--------+</span>
<span class="sd"> | NULL| 1| 7|</span>
<span class="sd"> |Alice| 0| 2|</span>
<span class="sd"> | Bob| 0| 5|</span>
<span class="sd"> +-----+--------------+--------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;grouping&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="grouping_id"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.grouping_id.html#pyspark.sql.functions.grouping_id">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">grouping_id</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the level of grouping, equals to</span>
<span class="sd"> (grouping(c1) &lt;&lt; (n-1)) + (grouping(c2) &lt;&lt; (n-2)) + ... + grouping(cn)</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The list of columns should match with grouping columns exactly, or empty (means all</span>
<span class="sd"> the grouping columns).</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> columns to check for.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> returns level of the grouping it relates to.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, &quot;a&quot;, &quot;a&quot;),</span>
<span class="sd"> ... (3, &quot;a&quot;, &quot;a&quot;),</span>
<span class="sd"> ... (4, &quot;b&quot;, &quot;c&quot;)], [&quot;c1&quot;, &quot;c2&quot;, &quot;c3&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.cube(&quot;c2&quot;, &quot;c3&quot;).agg(grouping_id(), sum(&quot;c1&quot;)).orderBy(&quot;c2&quot;, &quot;c3&quot;).show()</span>
<span class="sd"> +----+----+-------------+-------+</span>
<span class="sd"> | c2| c3|grouping_id()|sum(c1)|</span>
<span class="sd"> +----+----+-------------+-------+</span>
<span class="sd"> |NULL|NULL| 3| 8|</span>
<span class="sd"> |NULL| a| 2| 4|</span>
<span class="sd"> |NULL| c| 2| 4|</span>
<span class="sd"> | a|NULL| 1| 4|</span>
<span class="sd"> | a| a| 0| 4|</span>
<span class="sd"> | b|NULL| 1| 4|</span>
<span class="sd"> | b| c| 0| 4|</span>
<span class="sd"> +----+----+-------------+-------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">&quot;grouping_id&quot;</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div>
<div class="viewcode-block" id="count_min_sketch"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.count_min_sketch.html#pyspark.sql.functions.count_min_sketch">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">count_min_sketch</span><span class="p">(</span>
<span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">eps</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">confidence</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">seed</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a count-min sketch of a column with the given esp, confidence and seed.</span>
<span class="sd"> The result is an array of bytes, which can be deserialized to a `CountMinSketch` before usage.</span>
<span class="sd"> Count-min sketch is a probabilistic data structure used for cardinality estimation</span>
<span class="sd"> using sub-linear space.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> eps : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> relative error, must be positive</span>
<span class="sd"> confidence : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> confidence, must be positive and less than 1.0</span>
<span class="sd"> seed : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> random seed</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> count-min sketch of the column</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[1], [2], [1]], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df = df.agg(count_min_sketch(df.data, lit(0.5), lit(0.5), lit(1)).alias(&#39;sketch&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(hex(df.sketch).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;0000000100000000000000030000000100000004000000005D8D6AB90000000000000000000000000000000200000000000000010000000000000000&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;count_min_sketch&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">eps</span><span class="p">,</span> <span class="n">confidence</span><span class="p">,</span> <span class="n">seed</span><span class="p">)</span></div>
<div class="viewcode-block" id="input_file_name"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.input_file_name.html#pyspark.sql.functions.input_file_name">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">input_file_name</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Creates a string column for the file name of the current Spark task.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> file names.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import os</span>
<span class="sd"> &gt;&gt;&gt; path = os.path.abspath(__file__)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.read.text(path)</span>
<span class="sd"> &gt;&gt;&gt; df.select(input_file_name()).first()</span>
<span class="sd"> Row(input_file_name()=&#39;file:///...&#39;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;input_file_name&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="isnan"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.isnan.html#pyspark.sql.functions.isnan">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">isnan</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;An expression that returns true if the column is NaN.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> True if value is NaN and False otherwise.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1.0, float(&#39;nan&#39;)), (float(&#39;nan&#39;), 2.0)], (&quot;a&quot;, &quot;b&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;a&quot;, &quot;b&quot;, isnan(&quot;a&quot;).alias(&quot;r1&quot;), isnan(df.b).alias(&quot;r2&quot;)).show()</span>
<span class="sd"> +---+---+-----+-----+</span>
<span class="sd"> | a| b| r1| r2|</span>
<span class="sd"> +---+---+-----+-----+</span>
<span class="sd"> |1.0|NaN|false| true|</span>
<span class="sd"> |NaN|2.0| true|false|</span>
<span class="sd"> +---+---+-----+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;isnan&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="isnull"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.isnull.html#pyspark.sql.functions.isnull">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">isnull</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;An expression that returns true if the column is null.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> True if value is null and False otherwise.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, None), (None, 2)], (&quot;a&quot;, &quot;b&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;a&quot;, &quot;b&quot;, isnull(&quot;a&quot;).alias(&quot;r1&quot;), isnull(df.b).alias(&quot;r2&quot;)).show()</span>
<span class="sd"> +----+----+-----+-----+</span>
<span class="sd"> | a| b| r1| r2|</span>
<span class="sd"> +----+----+-----+-----+</span>
<span class="sd"> | 1|NULL|false| true|</span>
<span class="sd"> |NULL| 2| true|false|</span>
<span class="sd"> +----+----+-----+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;isnull&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="last"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.last.html#pyspark.sql.functions.last">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">last</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">ignorenulls</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Aggregate function: returns the last value in a group.</span>
<span class="sd"> The function by default returns the last values it sees. It will return the last non-null</span>
<span class="sd"> value it sees when ignoreNulls is set to true. If all values are null, then null is returned.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The function is non-deterministic because its results depends on the order of the</span>
<span class="sd"> rows which may be non-deterministic after a shuffle.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column to fetch last value for.</span>
<span class="sd"> ignorenulls : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> if last value is null then look for non-null value.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> last value of the group.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;Alice&quot;, 2), (&quot;Bob&quot;, 5), (&quot;Alice&quot;, None)], (&quot;name&quot;, &quot;age&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df = df.orderBy(df.age.desc())</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;name&quot;).agg(last(&quot;age&quot;)).orderBy(&quot;name&quot;).show()</span>
<span class="sd"> +-----+---------+</span>
<span class="sd"> | name|last(age)|</span>
<span class="sd"> +-----+---------+</span>
<span class="sd"> |Alice| NULL|</span>
<span class="sd"> | Bob| 5|</span>
<span class="sd"> +-----+---------+</span>
<span class="sd"> Now, to ignore any nulls we needs to set ``ignorenulls`` to `True`</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;name&quot;).agg(last(&quot;age&quot;, ignorenulls=True)).orderBy(&quot;name&quot;).show()</span>
<span class="sd"> +-----+---------+</span>
<span class="sd"> | name|last(age)|</span>
<span class="sd"> +-----+---------+</span>
<span class="sd"> |Alice| 2|</span>
<span class="sd"> | Bob| 5|</span>
<span class="sd"> +-----+---------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;last&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">ignorenulls</span><span class="p">)</span></div>
<div class="viewcode-block" id="monotonically_increasing_id"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.monotonically_increasing_id.html#pyspark.sql.functions.monotonically_increasing_id">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">monotonically_increasing_id</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;A column that generates monotonically increasing 64-bit integers.</span>
<span class="sd"> The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive.</span>
<span class="sd"> The current implementation puts the partition ID in the upper 31 bits, and the record number</span>
<span class="sd"> within each partition in the lower 33 bits. The assumption is that the data frame has</span>
<span class="sd"> less than 1 billion partitions, and each partition has less than 8 billion records.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The function is non-deterministic because its result depends on partition IDs.</span>
<span class="sd"> As an example, consider a :class:`DataFrame` with two partitions, each with 3 records.</span>
<span class="sd"> This expression would return the following IDs:</span>
<span class="sd"> 0, 1, 2, 8589934592 (1L &lt;&lt; 33), 8589934593, 8589934594.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> last value of the group.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(0, 10, 1, 2).select(sf.monotonically_increasing_id()).show()</span>
<span class="sd"> +-----------------------------+</span>
<span class="sd"> |monotonically_increasing_id()|</span>
<span class="sd"> +-----------------------------+</span>
<span class="sd"> | 0|</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 2|</span>
<span class="sd"> | 3|</span>
<span class="sd"> | 4|</span>
<span class="sd"> | 8589934592|</span>
<span class="sd"> | 8589934593|</span>
<span class="sd"> | 8589934594|</span>
<span class="sd"> | 8589934595|</span>
<span class="sd"> | 8589934596|</span>
<span class="sd"> +-----------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;monotonically_increasing_id&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="nanvl"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.nanvl.html#pyspark.sql.functions.nanvl">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">nanvl</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns col1 if it is not NaN, or col2 if col1 is NaN.</span>
<span class="sd"> Both inputs should be floating point columns (:class:`DoubleType` or :class:`FloatType`).</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> first column to check.</span>
<span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> second column to return if first is NaN.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> value from first column or second if first is NaN .</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1.0, float(&#39;nan&#39;)), (float(&#39;nan&#39;), 2.0)], (&quot;a&quot;, &quot;b&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(nanvl(&quot;a&quot;, &quot;b&quot;).alias(&quot;r1&quot;), nanvl(df.a, df.b).alias(&quot;r2&quot;)).collect()</span>
<span class="sd"> [Row(r1=1.0, r2=1.0), Row(r1=2.0, r2=2.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;nanvl&quot;</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div>
<div class="viewcode-block" id="percentile"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.percentile.html#pyspark.sql.functions.percentile">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">percentile</span><span class="p">(</span>
<span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">percentage</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">float</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">float</span><span class="p">]],</span>
<span class="n">frequency</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the exact percentile(s) of numeric column `expr` at the given percentage(s)</span>
<span class="sd"> with value range in [0.0, 1.0].</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str input column.</span>
<span class="sd"> percentage : :class:`~pyspark.sql.Column`, float, list of floats or tuple of floats</span>
<span class="sd"> percentage in decimal (must be between 0.0 and 1.0).</span>
<span class="sd"> frequency : :class:`~pyspark.sql.Column` or int is a positive numeric literal which</span>
<span class="sd"> controls frequency.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the exact `percentile` of the numeric column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; key = (col(&quot;id&quot;) % 3).alias(&quot;key&quot;)</span>
<span class="sd"> &gt;&gt;&gt; value = (randn(42) + key * 10).alias(&quot;value&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(0, 1000, 1, 1).select(key, value)</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... percentile(&quot;value&quot;, [0.25, 0.5, 0.75], lit(1)).alias(&quot;quantiles&quot;)</span>
<span class="sd"> ... ).show()</span>
<span class="sd"> +--------------------+</span>
<span class="sd"> | quantiles|</span>
<span class="sd"> +--------------------+</span>
<span class="sd"> |[0.74419914941216...|</span>
<span class="sd"> +--------------------+</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy(&quot;key&quot;).agg(</span>
<span class="sd"> ... percentile(&quot;value&quot;, 0.5, lit(1)).alias(&quot;median&quot;)</span>
<span class="sd"> ... ).show()</span>
<span class="sd"> +---+--------------------+</span>
<span class="sd"> |key| median|</span>
<span class="sd"> +---+--------------------+</span>
<span class="sd"> | 0|-0.03449962216667901|</span>
<span class="sd"> | 1| 9.990389751837329|</span>
<span class="sd"> | 2| 19.967859769284075|</span>
<span class="sd"> +---+--------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">percentage</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span>
<span class="c1"># A local list</span>
<span class="n">percentage</span> <span class="o">=</span> <span class="n">_invoke_function</span><span class="p">(</span>
<span class="s2">&quot;array&quot;</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="p">[</span><span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">percentage</span><span class="p">])</span>
<span class="p">)</span><span class="o">.</span><span class="n">_jc</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">percentage</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="c1"># Already a Column</span>
<span class="n">percentage</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">percentage</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># Probably scalar</span>
<span class="n">percentage</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">percentage</span><span class="p">)</span>
<span class="n">frequency</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">_to_java_column</span><span class="p">(</span><span class="n">frequency</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">frequency</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span>
<span class="k">else</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">frequency</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;percentile&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">percentage</span><span class="p">,</span> <span class="n">frequency</span><span class="p">)</span></div>
<div class="viewcode-block" id="percentile_approx"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.percentile_approx.html#pyspark.sql.functions.percentile_approx">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">percentile_approx</span><span class="p">(</span>
<span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">percentage</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">float</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">float</span><span class="p">]],</span>
<span class="n">accuracy</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="mi">10000</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the approximate `percentile` of the numeric column `col` which is the smallest value</span>
<span class="sd"> in the ordered `col` values (sorted from least to greatest) such that no more than `percentage`</span>
<span class="sd"> of `col` values is less than the value or equal to that value.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> input column.</span>
<span class="sd"> percentage : :class:`~pyspark.sql.Column`, float, list of floats or tuple of floats</span>
<span class="sd"> percentage in decimal (must be between 0.0 and 1.0).</span>
<span class="sd"> When percentage is an array, each value of the percentage array must be between 0.0 and 1.0.</span>
<span class="sd"> In this case, returns the approximate percentile array of column col</span>
<span class="sd"> at the given percentage array.</span>
<span class="sd"> accuracy : :class:`~pyspark.sql.Column` or float</span>
<span class="sd"> is a positive numeric literal which controls approximation accuracy</span>
<span class="sd"> at the cost of memory. Higher value of accuracy yields better accuracy,</span>
<span class="sd"> 1.0/accuracy is the relative error of the approximation. (default: 10000).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> approximate `percentile` of the numeric column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; key = (col(&quot;id&quot;) % 3).alias(&quot;key&quot;)</span>
<span class="sd"> &gt;&gt;&gt; value = (randn(42) + key * 10).alias(&quot;value&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(0, 1000, 1, 1).select(key, value)</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... percentile_approx(&quot;value&quot;, [0.25, 0.5, 0.75], 1000000).alias(&quot;quantiles&quot;)</span>
<span class="sd"> ... ).printSchema()</span>
<span class="sd"> root</span>
<span class="sd"> |-- quantiles: array (nullable = true)</span>
<span class="sd"> | |-- element: double (containsNull = false)</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy(&quot;key&quot;).agg(</span>
<span class="sd"> ... percentile_approx(&quot;value&quot;, 0.5, lit(1000000)).alias(&quot;median&quot;)</span>
<span class="sd"> ... ).printSchema()</span>
<span class="sd"> root</span>
<span class="sd"> |-- key: long (nullable = true)</span>
<span class="sd"> |-- median: double (nullable = true)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">percentage</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span>
<span class="c1"># A local list</span>
<span class="n">percentage</span> <span class="o">=</span> <span class="n">_invoke_function</span><span class="p">(</span>
<span class="s2">&quot;array&quot;</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="p">[</span><span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">percentage</span><span class="p">])</span>
<span class="p">)</span><span class="o">.</span><span class="n">_jc</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">percentage</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="c1"># Already a Column</span>
<span class="n">percentage</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">percentage</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># Probably scalar</span>
<span class="n">percentage</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">percentage</span><span class="p">)</span>
<span class="n">accuracy</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">_to_java_column</span><span class="p">(</span><span class="n">accuracy</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">accuracy</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span>
<span class="k">else</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">accuracy</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;percentile_approx&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">percentage</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">)</span></div>
<div class="viewcode-block" id="approx_percentile"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.approx_percentile.html#pyspark.sql.functions.approx_percentile">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">approx_percentile</span><span class="p">(</span>
<span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">percentage</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">float</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">float</span><span class="p">]],</span>
<span class="n">accuracy</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="mi">10000</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the approximate `percentile` of the numeric column `col` which is the smallest value</span>
<span class="sd"> in the ordered `col` values (sorted from least to greatest) such that no more than `percentage`</span>
<span class="sd"> of `col` values is less than the value or equal to that value.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> input column.</span>
<span class="sd"> percentage : :class:`~pyspark.sql.Column`, float, list of floats or tuple of floats</span>
<span class="sd"> percentage in decimal (must be between 0.0 and 1.0).</span>
<span class="sd"> When percentage is an array, each value of the percentage array must be between 0.0 and 1.0.</span>
<span class="sd"> In this case, returns the approximate percentile array of column col</span>
<span class="sd"> at the given percentage array.</span>
<span class="sd"> accuracy : :class:`~pyspark.sql.Column` or float</span>
<span class="sd"> is a positive numeric literal which controls approximation accuracy</span>
<span class="sd"> at the cost of memory. Higher value of accuracy yields better accuracy,</span>
<span class="sd"> 1.0/accuracy is the relative error of the approximation. (default: 10000).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> approximate `percentile` of the numeric column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; key = (sf.col(&quot;id&quot;) % 3).alias(&quot;key&quot;)</span>
<span class="sd"> &gt;&gt;&gt; value = (sf.randn(42) + key * 10).alias(&quot;value&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(0, 1000, 1, 1).select(key, value)</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... sf.approx_percentile(&quot;value&quot;, [0.25, 0.5, 0.75], 1000000)</span>
<span class="sd"> ... ).printSchema()</span>
<span class="sd"> root</span>
<span class="sd"> |-- approx_percentile(value, array(0.25, 0.5, 0.75), 1000000): array (nullable = true)</span>
<span class="sd"> | |-- element: double (containsNull = false)</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy(&quot;key&quot;).agg(</span>
<span class="sd"> ... sf.approx_percentile(&quot;value&quot;, 0.5, sf.lit(1000000))</span>
<span class="sd"> ... ).printSchema()</span>
<span class="sd"> root</span>
<span class="sd"> |-- key: long (nullable = true)</span>
<span class="sd"> |-- approx_percentile(value, 0.5, 1000000): double (nullable = true)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">percentage</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span>
<span class="c1"># A local list</span>
<span class="n">percentage</span> <span class="o">=</span> <span class="n">_invoke_function</span><span class="p">(</span>
<span class="s2">&quot;array&quot;</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="p">[</span><span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">percentage</span><span class="p">])</span>
<span class="p">)</span><span class="o">.</span><span class="n">_jc</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">percentage</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="c1"># Already a Column</span>
<span class="n">percentage</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">percentage</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># Probably scalar</span>
<span class="n">percentage</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">percentage</span><span class="p">)</span>
<span class="n">accuracy</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">_to_java_column</span><span class="p">(</span><span class="n">accuracy</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">accuracy</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span>
<span class="k">else</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">accuracy</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;approx_percentile&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">percentage</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">)</span></div>
<div class="viewcode-block" id="rand"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.rand.html#pyspark.sql.functions.rand">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">rand</span><span class="p">(</span><span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Generates a random column with independent and identically distributed (i.i.d.) samples</span>
<span class="sd"> uniformly distributed in [0.0, 1.0).</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The function is non-deterministic in general case.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> seed : int (default: None)</span>
<span class="sd"> seed value for random generator.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> random values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(0, 2, 1, 1).withColumn(&#39;rand&#39;, sf.rand(seed=42) * 3).show()</span>
<span class="sd"> +---+------------------+</span>
<span class="sd"> | id| rand|</span>
<span class="sd"> +---+------------------+</span>
<span class="sd"> | 0|1.8575681106759028|</span>
<span class="sd"> | 1|1.5288056527339444|</span>
<span class="sd"> +---+------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">seed</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;rand&quot;</span><span class="p">,</span> <span class="n">seed</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;rand&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="randn"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.randn.html#pyspark.sql.functions.randn">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">randn</span><span class="p">(</span><span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Generates a column with independent and identically distributed (i.i.d.) samples from</span>
<span class="sd"> the standard normal distribution.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The function is non-deterministic in general case.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> seed : int (default: None)</span>
<span class="sd"> seed value for random generator.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> random values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(0, 2, 1, 1).withColumn(&#39;randn&#39;, sf.randn(seed=42)).show()</span>
<span class="sd"> +---+------------------+</span>
<span class="sd"> | id| randn|</span>
<span class="sd"> +---+------------------+</span>
<span class="sd"> | 0| 2.384479054241165|</span>
<span class="sd"> | 1|0.1920934041293524|</span>
<span class="sd"> +---+------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">seed</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;randn&quot;</span><span class="p">,</span> <span class="n">seed</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;randn&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="round"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.round.html#pyspark.sql.functions.round">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">round</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">scale</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Round the given value to `scale` decimal places using HALF_UP rounding mode if `scale` &gt;= 0</span>
<span class="sd"> or at integral part when `scale` &lt; 0.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> input column to round.</span>
<span class="sd"> scale : int optional default 0</span>
<span class="sd"> scale value.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> rounded values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(2.5,)], [&#39;a&#39;]).select(round(&#39;a&#39;, 0).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=3.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;round&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">scale</span><span class="p">)</span></div>
<div class="viewcode-block" id="bround"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bround.html#pyspark.sql.functions.bround">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">bround</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">scale</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Round the given value to `scale` decimal places using HALF_EVEN rounding mode if `scale` &gt;= 0</span>
<span class="sd"> or at integral part when `scale` &lt; 0.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> input column to round.</span>
<span class="sd"> scale : int optional default 0</span>
<span class="sd"> scale value.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> rounded values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(2.5,)], [&#39;a&#39;]).select(bround(&#39;a&#39;, 0).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=2.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;bround&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">scale</span><span class="p">)</span></div>
<span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">shiftLeft</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">numBits</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Shift the given value numBits left.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> .. deprecated:: 3.2.0</span>
<span class="sd"> Use :func:`shiftleft` instead.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">&quot;Deprecated in 3.2, use shiftleft instead.&quot;</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span>
<span class="k">return</span> <span class="n">shiftleft</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">numBits</span><span class="p">)</span>
<div class="viewcode-block" id="shiftleft"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.shiftleft.html#pyspark.sql.functions.shiftleft">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">shiftleft</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">numBits</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Shift the given value numBits left.</span>
<span class="sd"> .. versionadded:: 3.2.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> input column of values to shift.</span>
<span class="sd"> numBits : int</span>
<span class="sd"> number of bits to shift.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> shifted value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(21,)], [&#39;a&#39;]).select(shiftleft(&#39;a&#39;, 1).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=42)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;shiftleft&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">numBits</span><span class="p">)</span></div>
<span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">shiftRight</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">numBits</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;(Signed) shift the given value numBits right.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> .. deprecated:: 3.2.0</span>
<span class="sd"> Use :func:`shiftright` instead.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">&quot;Deprecated in 3.2, use shiftright instead.&quot;</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span>
<span class="k">return</span> <span class="n">shiftright</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">numBits</span><span class="p">)</span>
<div class="viewcode-block" id="shiftright"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.shiftright.html#pyspark.sql.functions.shiftright">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">shiftright</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">numBits</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;(Signed) shift the given value numBits right.</span>
<span class="sd"> .. versionadded:: 3.2.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> input column of values to shift.</span>
<span class="sd"> numBits : int</span>
<span class="sd"> number of bits to shift.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> shifted values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(42,)], [&#39;a&#39;]).select(shiftright(&#39;a&#39;, 1).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=21)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;shiftright&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">numBits</span><span class="p">)</span></div>
<span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">shiftRightUnsigned</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">numBits</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Unsigned shift the given value numBits right.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> .. deprecated:: 3.2.0</span>
<span class="sd"> Use :func:`shiftrightunsigned` instead.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">&quot;Deprecated in 3.2, use shiftrightunsigned instead.&quot;</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span>
<span class="k">return</span> <span class="n">shiftrightunsigned</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">numBits</span><span class="p">)</span>
<div class="viewcode-block" id="shiftrightunsigned"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.shiftrightunsigned.html#pyspark.sql.functions.shiftrightunsigned">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">shiftrightunsigned</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">numBits</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Unsigned shift the given value numBits right.</span>
<span class="sd"> .. versionadded:: 3.2.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> input column of values to shift.</span>
<span class="sd"> numBits : int</span>
<span class="sd"> number of bits to shift.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> shifted value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(-42,)], [&#39;a&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(shiftrightunsigned(&#39;a&#39;, 1).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=9223372036854775787)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;shiftrightunsigned&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">numBits</span><span class="p">)</span></div>
<div class="viewcode-block" id="spark_partition_id"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.spark_partition_id.html#pyspark.sql.functions.spark_partition_id">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">spark_partition_id</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;A column for partition ID.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This is non deterministic because it depends on data partitioning and task scheduling.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> partition id the record belongs to.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(2)</span>
<span class="sd"> &gt;&gt;&gt; df.repartition(1).select(spark_partition_id().alias(&quot;pid&quot;)).collect()</span>
<span class="sd"> [Row(pid=0), Row(pid=0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;spark_partition_id&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="expr"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.expr.html#pyspark.sql.functions.expr">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">expr</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Parses the expression string into the column that it represents</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : str</span>
<span class="sd"> expression defined in string.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> column representing the expression.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[&quot;Alice&quot;], [&quot;Bob&quot;]], [&quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;name&quot;, expr(&quot;length(name)&quot;)).show()</span>
<span class="sd"> +-----+------------+</span>
<span class="sd"> | name|length(name)|</span>
<span class="sd"> +-----+------------+</span>
<span class="sd"> |Alice| 5|</span>
<span class="sd"> | Bob| 3|</span>
<span class="sd"> +-----+------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;expr&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span></div>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">struct</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">struct</span><span class="p">(</span><span class="n">__cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">&quot;ColumnOrName_&quot;</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">&quot;ColumnOrName_&quot;</span><span class="p">,</span> <span class="o">...</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="struct"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.struct.html#pyspark.sql.functions.struct">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">struct</span><span class="p">(</span>
<span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">&quot;ColumnOrName_&quot;</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">&quot;ColumnOrName_&quot;</span><span class="p">,</span> <span class="o">...</span><span class="p">]]]</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Creates a new struct column.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : list, set, str or :class:`~pyspark.sql.Column`</span>
<span class="sd"> column names or :class:`~pyspark.sql.Column`\\s to contain in the output struct.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a struct type column of given columns.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;Alice&quot;, 2), (&quot;Bob&quot;, 5)], (&quot;name&quot;, &quot;age&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(struct(&#39;age&#39;, &#39;name&#39;).alias(&quot;struct&quot;)).collect()</span>
<span class="sd"> [Row(struct=Row(age=2, name=&#39;Alice&#39;)), Row(struct=Row(age=5, name=&#39;Bob&#39;))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(struct([df.age, df.name]).alias(&quot;struct&quot;)).collect()</span>
<span class="sd"> [Row(struct=Row(age=2, name=&#39;Alice&#39;)), Row(struct=Row(age=5, name=&#39;Bob&#39;))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">set</span><span class="p">)):</span>
<span class="n">cols</span> <span class="o">=</span> <span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="c1"># type: ignore[assignment]</span>
<span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">&quot;struct&quot;</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span> <span class="c1"># type: ignore[arg-type]</span></div>
<div class="viewcode-block" id="named_struct"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.named_struct.html#pyspark.sql.functions.named_struct">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">named_struct</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Creates a struct with the given field names and values.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> list of columns to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, 2, 3)], [&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(named_struct(lit(&#39;x&#39;), df.a, lit(&#39;y&#39;), df.b).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=Row(x=1, y=2))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">&quot;named_struct&quot;</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div>
<div class="viewcode-block" id="greatest"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.greatest.html#pyspark.sql.functions.greatest">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">greatest</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the greatest value of the list of column names, skipping null values.</span>
<span class="sd"> This function takes at least 2 parameters. It will return null if all parameters are null.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> columns to check for gratest value.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> gratest value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, 4, 3)], [&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(greatest(df.a, df.b, df.c).alias(&quot;greatest&quot;)).collect()</span>
<span class="sd"> [Row(greatest=4)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mi">2</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span>
<span class="n">error_class</span><span class="o">=</span><span class="s2">&quot;WRONG_NUM_COLUMNS&quot;</span><span class="p">,</span>
<span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;func_name&quot;</span><span class="p">:</span> <span class="s2">&quot;greatest&quot;</span><span class="p">,</span> <span class="s2">&quot;num_cols&quot;</span><span class="p">:</span> <span class="s2">&quot;2&quot;</span><span class="p">},</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">&quot;greatest&quot;</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div>
<div class="viewcode-block" id="least"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.least.html#pyspark.sql.functions.least">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">least</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the least value of the list of column names, skipping null values.</span>
<span class="sd"> This function takes at least 2 parameters. It will return null if all parameters are null.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column names or columns to be compared</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> least value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, 4, 3)], [&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(least(df.a, df.b, df.c).alias(&quot;least&quot;)).collect()</span>
<span class="sd"> [Row(least=1)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mi">2</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span>
<span class="n">error_class</span><span class="o">=</span><span class="s2">&quot;WRONG_NUM_COLUMNS&quot;</span><span class="p">,</span>
<span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;func_name&quot;</span><span class="p">:</span> <span class="s2">&quot;least&quot;</span><span class="p">,</span> <span class="s2">&quot;num_cols&quot;</span><span class="p">:</span> <span class="s2">&quot;2&quot;</span><span class="p">},</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">&quot;least&quot;</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div>
<div class="viewcode-block" id="when"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.when.html#pyspark.sql.functions.when">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">when</span><span class="p">(</span><span class="n">condition</span><span class="p">:</span> <span class="n">Column</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Evaluates a list of conditions and returns one of multiple possible result expressions.</span>
<span class="sd"> If :func:`pyspark.sql.Column.otherwise` is not invoked, None is returned for unmatched</span>
<span class="sd"> conditions.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> condition : :class:`~pyspark.sql.Column`</span>
<span class="sd"> a boolean :class:`~pyspark.sql.Column` expression.</span>
<span class="sd"> value :</span>
<span class="sd"> a literal value, or a :class:`~pyspark.sql.Column` expression.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> column representing when expression.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(3)</span>
<span class="sd"> &gt;&gt;&gt; df.select(when(df[&#39;id&#39;] == 2, 3).otherwise(4).alias(&quot;age&quot;)).show()</span>
<span class="sd"> +---+</span>
<span class="sd"> |age|</span>
<span class="sd"> +---+</span>
<span class="sd"> | 4|</span>
<span class="sd"> | 4|</span>
<span class="sd"> | 3|</span>
<span class="sd"> +---+</span>
<span class="sd"> &gt;&gt;&gt; df.select(when(df.id == 2, df.id + 1).alias(&quot;age&quot;)).show()</span>
<span class="sd"> +----+</span>
<span class="sd"> | age|</span>
<span class="sd"> +----+</span>
<span class="sd"> |NULL|</span>
<span class="sd"> |NULL|</span>
<span class="sd"> | 3|</span>
<span class="sd"> +----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># Explicitly not using ColumnOrName type here to make reading condition less opaque</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">condition</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span>
<span class="n">error_class</span><span class="o">=</span><span class="s2">&quot;NOT_COLUMN&quot;</span><span class="p">,</span>
<span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;arg_name&quot;</span><span class="p">:</span> <span class="s2">&quot;condition&quot;</span><span class="p">,</span> <span class="s2">&quot;arg_type&quot;</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">condition</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span>
<span class="p">)</span>
<span class="n">v</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">_jc</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> <span class="k">else</span> <span class="n">value</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;when&quot;</span><span class="p">,</span> <span class="n">condition</span><span class="o">.</span><span class="n">_jc</span><span class="p">,</span> <span class="n">v</span><span class="p">)</span></div>
<span class="nd">@overload</span> <span class="c1"># type: ignore[no-redef]</span>
<span class="k">def</span> <span class="nf">log</span><span class="p">(</span><span class="n">arg1</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">log</span><span class="p">(</span><span class="n">arg1</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> <span class="n">arg2</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="log"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.log.html#pyspark.sql.functions.log">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">log</span><span class="p">(</span><span class="n">arg1</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">float</span><span class="p">],</span> <span class="n">arg2</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the first argument-based logarithm of the second argument.</span>
<span class="sd"> If there is only one argument, then this takes the natural logarithm of the argument.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> arg1 : :class:`~pyspark.sql.Column`, str or float</span>
<span class="sd"> base number or actual number (in this case base is `e`)</span>
<span class="sd"> arg2 : :class:`~pyspark.sql.Column`, str or float</span>
<span class="sd"> number to calculate logariphm for.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> logariphm of given value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import functions as sf</span>
<span class="sd"> &gt;&gt;&gt; df = spark.sql(&quot;SELECT * FROM VALUES (1), (2), (4) AS t(value)&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(sf.log(2.0, df.value).alias(&#39;log2_value&#39;)).show()</span>
<span class="sd"> +----------+</span>
<span class="sd"> |log2_value|</span>
<span class="sd"> +----------+</span>
<span class="sd"> | 0.0|</span>
<span class="sd"> | 1.0|</span>
<span class="sd"> | 2.0|</span>
<span class="sd"> +----------+</span>
<span class="sd"> And Natural logarithm</span>
<span class="sd"> &gt;&gt;&gt; df.select(sf.log(df.value).alias(&#39;ln_value&#39;)).show()</span>
<span class="sd"> +------------------+</span>
<span class="sd"> | ln_value|</span>
<span class="sd"> +------------------+</span>
<span class="sd"> | 0.0|</span>
<span class="sd"> |0.6931471805599453|</span>
<span class="sd"> |1.3862943611198906|</span>
<span class="sd"> +------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">arg2</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;log&quot;</span><span class="p">,</span> <span class="n">cast</span><span class="p">(</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;log&quot;</span><span class="p">,</span> <span class="n">arg1</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">arg2</span><span class="p">))</span></div>
<div class="viewcode-block" id="ln"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.ln.html#pyspark.sql.functions.ln">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">ln</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the natural logarithm of the argument.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a column to calculate logariphm for.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> natural logarithm of given value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(4,)], [&#39;a&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(ln(&#39;a&#39;)).show()</span>
<span class="sd"> +------------------+</span>
<span class="sd"> | ln(a)|</span>
<span class="sd"> +------------------+</span>
<span class="sd"> |1.3862943611198906|</span>
<span class="sd"> +------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;ln&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="log2"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.log2.html#pyspark.sql.functions.log2">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">log2</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the base-2 logarithm of the argument.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a column to calculate logariphm for.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> logariphm of given value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(4,)], [&#39;a&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(log2(&#39;a&#39;).alias(&#39;log2&#39;)).show()</span>
<span class="sd"> +----+</span>
<span class="sd"> |log2|</span>
<span class="sd"> +----+</span>
<span class="sd"> | 2.0|</span>
<span class="sd"> +----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;log2&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="conv"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.conv.html#pyspark.sql.functions.conv">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">conv</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">fromBase</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">toBase</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Convert a number in a string column from one base to another.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a column to convert base for.</span>
<span class="sd"> fromBase: int</span>
<span class="sd"> from base number.</span>
<span class="sd"> toBase: int</span>
<span class="sd"> to base number.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> logariphm of given value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;010101&quot;,)], [&#39;n&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(conv(df.n, 2, 16).alias(&#39;hex&#39;)).collect()</span>
<span class="sd"> [Row(hex=&#39;15&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;conv&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">fromBase</span><span class="p">,</span> <span class="n">toBase</span><span class="p">)</span></div>
<div class="viewcode-block" id="factorial"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.factorial.html#pyspark.sql.functions.factorial">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">factorial</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the factorial of the given value.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a column to calculate factorial for.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> factorial of given value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(5,)], [&#39;n&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(factorial(df.n).alias(&#39;f&#39;)).collect()</span>
<span class="sd"> [Row(f=120)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;factorial&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<span class="c1"># --------------- Window functions ------------------------</span>
<div class="viewcode-block" id="lag"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.lag.html#pyspark.sql.functions.lag">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">lag</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">offset</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">default</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Window function: returns the value that is `offset` rows before the current row, and</span>
<span class="sd"> `default` if there is less than `offset` rows before the current row. For example,</span>
<span class="sd"> an `offset` of one will return the previous row at any given point in the window partition.</span>
<span class="sd"> This is equivalent to the LAG function in SQL.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> offset : int, optional default 1</span>
<span class="sd"> number of row to extend</span>
<span class="sd"> default : optional</span>
<span class="sd"> default value</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> value before current row based on `offset`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Window</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;a&quot;, 1),</span>
<span class="sd"> ... (&quot;a&quot;, 2),</span>
<span class="sd"> ... (&quot;a&quot;, 3),</span>
<span class="sd"> ... (&quot;b&quot;, 8),</span>
<span class="sd"> ... (&quot;b&quot;, 2)], [&quot;c1&quot;, &quot;c2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | c1| c2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | a| 1|</span>
<span class="sd"> | a| 2|</span>
<span class="sd"> | a| 3|</span>
<span class="sd"> | b| 8|</span>
<span class="sd"> | b| 2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> &gt;&gt;&gt; w = Window.partitionBy(&quot;c1&quot;).orderBy(&quot;c2&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.withColumn(&quot;previos_value&quot;, lag(&quot;c2&quot;).over(w)).show()</span>
<span class="sd"> +---+---+-------------+</span>
<span class="sd"> | c1| c2|previos_value|</span>
<span class="sd"> +---+---+-------------+</span>
<span class="sd"> | a| 1| NULL|</span>
<span class="sd"> | a| 2| 1|</span>
<span class="sd"> | a| 3| 2|</span>
<span class="sd"> | b| 2| NULL|</span>
<span class="sd"> | b| 8| 2|</span>
<span class="sd"> +---+---+-------------+</span>
<span class="sd"> &gt;&gt;&gt; df.withColumn(&quot;previos_value&quot;, lag(&quot;c2&quot;, 1, 0).over(w)).show()</span>
<span class="sd"> +---+---+-------------+</span>
<span class="sd"> | c1| c2|previos_value|</span>
<span class="sd"> +---+---+-------------+</span>
<span class="sd"> | a| 1| 0|</span>
<span class="sd"> | a| 2| 1|</span>
<span class="sd"> | a| 3| 2|</span>
<span class="sd"> | b| 2| 0|</span>
<span class="sd"> | b| 8| 2|</span>
<span class="sd"> +---+---+-------------+</span>
<span class="sd"> &gt;&gt;&gt; df.withColumn(&quot;previos_value&quot;, lag(&quot;c2&quot;, 2, -1).over(w)).show()</span>
<span class="sd"> +---+---+-------------+</span>
<span class="sd"> | c1| c2|previos_value|</span>
<span class="sd"> +---+---+-------------+</span>
<span class="sd"> | a| 1| -1|</span>
<span class="sd"> | a| 2| -1|</span>
<span class="sd"> | a| 3| 1|</span>
<span class="sd"> | b| 2| -1|</span>
<span class="sd"> | b| 8| -1|</span>
<span class="sd"> +---+---+-------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;lag&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">offset</span><span class="p">,</span> <span class="n">default</span><span class="p">)</span></div>
<div class="viewcode-block" id="lead"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.lead.html#pyspark.sql.functions.lead">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">lead</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">offset</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">default</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Window function: returns the value that is `offset` rows after the current row, and</span>
<span class="sd"> `default` if there is less than `offset` rows after the current row. For example,</span>
<span class="sd"> an `offset` of one will return the next row at any given point in the window partition.</span>
<span class="sd"> This is equivalent to the LEAD function in SQL.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> offset : int, optional default 1</span>
<span class="sd"> number of row to extend</span>
<span class="sd"> default : optional</span>
<span class="sd"> default value</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> value after current row based on `offset`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Window</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;a&quot;, 1),</span>
<span class="sd"> ... (&quot;a&quot;, 2),</span>
<span class="sd"> ... (&quot;a&quot;, 3),</span>
<span class="sd"> ... (&quot;b&quot;, 8),</span>
<span class="sd"> ... (&quot;b&quot;, 2)], [&quot;c1&quot;, &quot;c2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | c1| c2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | a| 1|</span>
<span class="sd"> | a| 2|</span>
<span class="sd"> | a| 3|</span>
<span class="sd"> | b| 8|</span>
<span class="sd"> | b| 2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> &gt;&gt;&gt; w = Window.partitionBy(&quot;c1&quot;).orderBy(&quot;c2&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.withColumn(&quot;next_value&quot;, lead(&quot;c2&quot;).over(w)).show()</span>
<span class="sd"> +---+---+----------+</span>
<span class="sd"> | c1| c2|next_value|</span>
<span class="sd"> +---+---+----------+</span>
<span class="sd"> | a| 1| 2|</span>
<span class="sd"> | a| 2| 3|</span>
<span class="sd"> | a| 3| NULL|</span>
<span class="sd"> | b| 2| 8|</span>
<span class="sd"> | b| 8| NULL|</span>
<span class="sd"> +---+---+----------+</span>
<span class="sd"> &gt;&gt;&gt; df.withColumn(&quot;next_value&quot;, lead(&quot;c2&quot;, 1, 0).over(w)).show()</span>
<span class="sd"> +---+---+----------+</span>
<span class="sd"> | c1| c2|next_value|</span>
<span class="sd"> +---+---+----------+</span>
<span class="sd"> | a| 1| 2|</span>
<span class="sd"> | a| 2| 3|</span>
<span class="sd"> | a| 3| 0|</span>
<span class="sd"> | b| 2| 8|</span>
<span class="sd"> | b| 8| 0|</span>
<span class="sd"> +---+---+----------+</span>
<span class="sd"> &gt;&gt;&gt; df.withColumn(&quot;next_value&quot;, lead(&quot;c2&quot;, 2, -1).over(w)).show()</span>
<span class="sd"> +---+---+----------+</span>
<span class="sd"> | c1| c2|next_value|</span>
<span class="sd"> +---+---+----------+</span>
<span class="sd"> | a| 1| 3|</span>
<span class="sd"> | a| 2| -1|</span>
<span class="sd"> | a| 3| -1|</span>
<span class="sd"> | b| 2| -1|</span>
<span class="sd"> | b| 8| -1|</span>
<span class="sd"> +---+---+----------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;lead&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">offset</span><span class="p">,</span> <span class="n">default</span><span class="p">)</span></div>
<div class="viewcode-block" id="nth_value"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.nth_value.html#pyspark.sql.functions.nth_value">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">nth_value</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">offset</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">ignoreNulls</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Window function: returns the value that is the `offset`\\th row of the window frame</span>
<span class="sd"> (counting from 1), and `null` if the size of window frame is less than `offset` rows.</span>
<span class="sd"> It will return the `offset`\\th non-null value it sees when `ignoreNulls` is set to</span>
<span class="sd"> true. If all values are null, then null is returned.</span>
<span class="sd"> This is equivalent to the nth_value function in SQL.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> offset : int</span>
<span class="sd"> number of row to use as the value</span>
<span class="sd"> ignoreNulls : bool, optional</span>
<span class="sd"> indicates the Nth value should skip null in the</span>
<span class="sd"> determination of which row to use</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> value of nth row.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Window</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;a&quot;, 1),</span>
<span class="sd"> ... (&quot;a&quot;, 2),</span>
<span class="sd"> ... (&quot;a&quot;, 3),</span>
<span class="sd"> ... (&quot;b&quot;, 8),</span>
<span class="sd"> ... (&quot;b&quot;, 2)], [&quot;c1&quot;, &quot;c2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | c1| c2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | a| 1|</span>
<span class="sd"> | a| 2|</span>
<span class="sd"> | a| 3|</span>
<span class="sd"> | b| 8|</span>
<span class="sd"> | b| 2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> &gt;&gt;&gt; w = Window.partitionBy(&quot;c1&quot;).orderBy(&quot;c2&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.withColumn(&quot;nth_value&quot;, nth_value(&quot;c2&quot;, 1).over(w)).show()</span>
<span class="sd"> +---+---+---------+</span>
<span class="sd"> | c1| c2|nth_value|</span>
<span class="sd"> +---+---+---------+</span>
<span class="sd"> | a| 1| 1|</span>
<span class="sd"> | a| 2| 1|</span>
<span class="sd"> | a| 3| 1|</span>
<span class="sd"> | b| 2| 2|</span>
<span class="sd"> | b| 8| 2|</span>
<span class="sd"> +---+---+---------+</span>
<span class="sd"> &gt;&gt;&gt; df.withColumn(&quot;nth_value&quot;, nth_value(&quot;c2&quot;, 2).over(w)).show()</span>
<span class="sd"> +---+---+---------+</span>
<span class="sd"> | c1| c2|nth_value|</span>
<span class="sd"> +---+---+---------+</span>
<span class="sd"> | a| 1| NULL|</span>
<span class="sd"> | a| 2| 2|</span>
<span class="sd"> | a| 3| 2|</span>
<span class="sd"> | b| 2| NULL|</span>
<span class="sd"> | b| 8| 8|</span>
<span class="sd"> +---+---+---------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;nth_value&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">offset</span><span class="p">,</span> <span class="n">ignoreNulls</span><span class="p">)</span></div>
<div class="viewcode-block" id="any_value"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.any_value.html#pyspark.sql.functions.any_value">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">any_value</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">ignoreNulls</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">bool</span><span class="p">,</span> <span class="n">Column</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns some value of `col` for a group of rows.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> ignorenulls : :class:`~pyspark.sql.Column` or bool</span>
<span class="sd"> if first value is null then look for first non-null value.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> some value of `col` for a group of rows.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(None, 1),</span>
<span class="sd"> ... (&quot;a&quot;, 2),</span>
<span class="sd"> ... (&quot;a&quot;, 3),</span>
<span class="sd"> ... (&quot;b&quot;, 8),</span>
<span class="sd"> ... (&quot;b&quot;, 2)], [&quot;c1&quot;, &quot;c2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(any_value(&#39;c1&#39;), any_value(&#39;c2&#39;)).collect()</span>
<span class="sd"> [Row(any_value(c1)=None, any_value(c2)=1)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(any_value(&#39;c1&#39;, True), any_value(&#39;c2&#39;, True)).collect()</span>
<span class="sd"> [Row(any_value(c1)=&#39;a&#39;, any_value(c2)=1)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">ignoreNulls</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;any_value&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">ignoreNulls</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">ignoreNulls</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ignoreNulls</span><span class="p">,</span> <span class="nb">bool</span><span class="p">)</span> <span class="k">else</span> <span class="n">ignoreNulls</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;any_value&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">ignoreNulls</span><span class="p">)</span></div>
<div class="viewcode-block" id="first_value"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.first_value.html#pyspark.sql.functions.first_value">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">first_value</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">ignoreNulls</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">bool</span><span class="p">,</span> <span class="n">Column</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the first value of `col` for a group of rows. It will return the first non-null</span>
<span class="sd"> value it sees when `ignoreNulls` is set to true. If all values are null, then null is returned.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> ignorenulls : :class:`~pyspark.sql.Column` or bool</span>
<span class="sd"> if first value is null then look for first non-null value.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> some value of `col` for a group of rows.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(None, 1), (&quot;a&quot;, 2), (&quot;a&quot;, 3), (&quot;b&quot;, 8), (&quot;b&quot;, 2)], [&quot;a&quot;, &quot;b&quot;]</span>
<span class="sd"> ... ).select(sf.first_value(&#39;a&#39;), sf.first_value(&#39;b&#39;)).show()</span>
<span class="sd"> +--------------+--------------+</span>
<span class="sd"> |first_value(a)|first_value(b)|</span>
<span class="sd"> +--------------+--------------+</span>
<span class="sd"> | NULL| 1|</span>
<span class="sd"> +--------------+--------------+</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(None, 1), (&quot;a&quot;, 2), (&quot;a&quot;, 3), (&quot;b&quot;, 8), (&quot;b&quot;, 2)], [&quot;a&quot;, &quot;b&quot;]</span>
<span class="sd"> ... ).select(sf.first_value(&#39;a&#39;, True), sf.first_value(&#39;b&#39;, True)).show()</span>
<span class="sd"> +--------------+--------------+</span>
<span class="sd"> |first_value(a)|first_value(b)|</span>
<span class="sd"> +--------------+--------------+</span>
<span class="sd"> | a| 1|</span>
<span class="sd"> +--------------+--------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">ignoreNulls</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;first_value&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">ignoreNulls</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">ignoreNulls</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ignoreNulls</span><span class="p">,</span> <span class="nb">bool</span><span class="p">)</span> <span class="k">else</span> <span class="n">ignoreNulls</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;first_value&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">ignoreNulls</span><span class="p">)</span></div>
<div class="viewcode-block" id="last_value"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.last_value.html#pyspark.sql.functions.last_value">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">last_value</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">ignoreNulls</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">bool</span><span class="p">,</span> <span class="n">Column</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the last value of `col` for a group of rows. It will return the last non-null</span>
<span class="sd"> value it sees when `ignoreNulls` is set to true. If all values are null, then null is returned.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> ignorenulls : :class:`~pyspark.sql.Column` or bool</span>
<span class="sd"> if first value is null then look for first non-null value.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> some value of `col` for a group of rows.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(&quot;a&quot;, 1), (&quot;a&quot;, 2), (&quot;a&quot;, 3), (&quot;b&quot;, 8), (None, 2)], [&quot;a&quot;, &quot;b&quot;]</span>
<span class="sd"> ... ).select(sf.last_value(&#39;a&#39;), sf.last_value(&#39;b&#39;)).show()</span>
<span class="sd"> +-------------+-------------+</span>
<span class="sd"> |last_value(a)|last_value(b)|</span>
<span class="sd"> +-------------+-------------+</span>
<span class="sd"> | NULL| 2|</span>
<span class="sd"> +-------------+-------------+</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(&quot;a&quot;, 1), (&quot;a&quot;, 2), (&quot;a&quot;, 3), (&quot;b&quot;, 8), (None, 2)], [&quot;a&quot;, &quot;b&quot;]</span>
<span class="sd"> ... ).select(sf.last_value(&#39;a&#39;, True), sf.last_value(&#39;b&#39;, True)).show()</span>
<span class="sd"> +-------------+-------------+</span>
<span class="sd"> |last_value(a)|last_value(b)|</span>
<span class="sd"> +-------------+-------------+</span>
<span class="sd"> | b| 2|</span>
<span class="sd"> +-------------+-------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">ignoreNulls</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;last_value&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">ignoreNulls</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">ignoreNulls</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ignoreNulls</span><span class="p">,</span> <span class="nb">bool</span><span class="p">)</span> <span class="k">else</span> <span class="n">ignoreNulls</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;last_value&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">ignoreNulls</span><span class="p">)</span></div>
<div class="viewcode-block" id="count_if"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.count_if.html#pyspark.sql.functions.count_if">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">count_if</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the number of `TRUE` values for the `col`.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the number of `TRUE` values for the `col`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;a&quot;, 1),</span>
<span class="sd"> ... (&quot;a&quot;, 2),</span>
<span class="sd"> ... (&quot;a&quot;, 3),</span>
<span class="sd"> ... (&quot;b&quot;, 8),</span>
<span class="sd"> ... (&quot;b&quot;, 2)], [&quot;c1&quot;, &quot;c2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(count_if(col(&#39;c2&#39;) % 2 == 0)).show()</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> |count_if(((c2 % 2) = 0))|</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> | 3|</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;count_if&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="histogram_numeric"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.histogram_numeric.html#pyspark.sql.functions.histogram_numeric">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">histogram_numeric</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">nBins</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Computes a histogram on numeric &#39;col&#39; using nb bins.</span>
<span class="sd"> The return value is an array of (x,y) pairs representing the centers of the</span>
<span class="sd"> histogram&#39;s bins. As the value of &#39;nb&#39; is increased, the histogram approximation</span>
<span class="sd"> gets finer-grained, but may yield artifacts around outliers. In practice, 20-40</span>
<span class="sd"> histogram bins appear to work well, with more bins being required for skewed or</span>
<span class="sd"> smaller datasets. Note that this function creates a histogram with non-uniform</span>
<span class="sd"> bin widths. It offers no guarantees in terms of the mean-squared-error of the</span>
<span class="sd"> histogram, but in practice is comparable to the histograms produced by the R/S-Plus</span>
<span class="sd"> statistical computing packages. Note: the output type of the &#39;x&#39; field in the return value is</span>
<span class="sd"> propagated from the input value consumed in the aggregate function.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> nBins : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> number of Histogram columns.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a histogram on numeric &#39;col&#39; using nb bins.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;a&quot;, 1),</span>
<span class="sd"> ... (&quot;a&quot;, 2),</span>
<span class="sd"> ... (&quot;a&quot;, 3),</span>
<span class="sd"> ... (&quot;b&quot;, 8),</span>
<span class="sd"> ... (&quot;b&quot;, 2)], [&quot;c1&quot;, &quot;c2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(histogram_numeric(&#39;c2&#39;, lit(5))).show()</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> |histogram_numeric(c2, 5)|</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> | [{1, 1.0}, {2, 1....|</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;histogram_numeric&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">nBins</span><span class="p">)</span></div>
<div class="viewcode-block" id="ntile"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.ntile.html#pyspark.sql.functions.ntile">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">ntile</span><span class="p">(</span><span class="n">n</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Window function: returns the ntile group id (from 1 to `n` inclusive)</span>
<span class="sd"> in an ordered window partition. For example, if `n` is 4, the first</span>
<span class="sd"> quarter of the rows will get value 1, the second quarter will get 2,</span>
<span class="sd"> the third quarter will get 3, and the last quarter will get 4.</span>
<span class="sd"> This is equivalent to the NTILE function in SQL.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> n : int</span>
<span class="sd"> an integer</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> portioned group id.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Window</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;a&quot;, 1),</span>
<span class="sd"> ... (&quot;a&quot;, 2),</span>
<span class="sd"> ... (&quot;a&quot;, 3),</span>
<span class="sd"> ... (&quot;b&quot;, 8),</span>
<span class="sd"> ... (&quot;b&quot;, 2)], [&quot;c1&quot;, &quot;c2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | c1| c2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | a| 1|</span>
<span class="sd"> | a| 2|</span>
<span class="sd"> | a| 3|</span>
<span class="sd"> | b| 8|</span>
<span class="sd"> | b| 2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> &gt;&gt;&gt; w = Window.partitionBy(&quot;c1&quot;).orderBy(&quot;c2&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.withColumn(&quot;ntile&quot;, ntile(2).over(w)).show()</span>
<span class="sd"> +---+---+-----+</span>
<span class="sd"> | c1| c2|ntile|</span>
<span class="sd"> +---+---+-----+</span>
<span class="sd"> | a| 1| 1|</span>
<span class="sd"> | a| 2| 1|</span>
<span class="sd"> | a| 3| 2|</span>
<span class="sd"> | b| 2| 1|</span>
<span class="sd"> | b| 8| 2|</span>
<span class="sd"> +---+---+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;ntile&quot;</span><span class="p">,</span> <span class="nb">int</span><span class="p">(</span><span class="n">n</span><span class="p">))</span></div>
<span class="c1"># ---------------------- Date/Timestamp functions ------------------------------</span>
<div class="viewcode-block" id="curdate"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.curdate.html#pyspark.sql.functions.curdate">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">curdate</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the current date at the start of query evaluation as a :class:`DateType` column.</span>
<span class="sd"> All calls of current_date within the same query return the same value.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> current date.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).select(sf.curdate()).show() # doctest: +SKIP</span>
<span class="sd"> +--------------+</span>
<span class="sd"> |current_date()|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> | 2022-08-26|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;curdate&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="current_date"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.current_date.html#pyspark.sql.functions.current_date">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">current_date</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the current date at the start of query evaluation as a :class:`DateType` column.</span>
<span class="sd"> All calls of current_date within the same query return the same value.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> current date.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(current_date()).show() # doctest: +SKIP</span>
<span class="sd"> +--------------+</span>
<span class="sd"> |current_date()|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> | 2022-08-26|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;current_date&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="current_timezone"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.current_timezone.html#pyspark.sql.functions.current_timezone">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">current_timezone</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the current session local timezone.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> current session local timezone.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.set(&quot;spark.sql.session.timeZone&quot;, &quot;America/Los_Angeles&quot;)</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).select(current_timezone()).show()</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> | current_timezone()|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> |America/Los_Angeles|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.unset(&quot;spark.sql.session.timeZone&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;current_timezone&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="current_timestamp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.current_timestamp.html#pyspark.sql.functions.current_timestamp">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">current_timestamp</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the current timestamp at the start of query evaluation as a :class:`TimestampType`</span>
<span class="sd"> column. All calls of current_timestamp within the same query return the same value.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> current date and time.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(current_timestamp()).show(truncate=False) # doctest: +SKIP</span>
<span class="sd"> +-----------------------+</span>
<span class="sd"> |current_timestamp() |</span>
<span class="sd"> +-----------------------+</span>
<span class="sd"> |2022-08-26 21:23:22.716|</span>
<span class="sd"> +-----------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;current_timestamp&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="now"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.now.html#pyspark.sql.functions.now">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">now</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the current timestamp at the start of query evaluation.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> current timestamp at the start of query evaluation.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(now()).show(truncate=False) # doctest: +SKIP</span>
<span class="sd"> +-----------------------+</span>
<span class="sd"> |now() |</span>
<span class="sd"> +-----------------------+</span>
<span class="sd"> |2022-08-26 21:23:22.716|</span>
<span class="sd"> +-----------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;current_timestamp&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="localtimestamp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.localtimestamp.html#pyspark.sql.functions.localtimestamp">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">localtimestamp</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the current timestamp without time zone at the start of query evaluation</span>
<span class="sd"> as a timestamp without time zone column. All calls of localtimestamp within the</span>
<span class="sd"> same query return the same value.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> current local date and time.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(localtimestamp()).show(truncate=False) # doctest: +SKIP</span>
<span class="sd"> +-----------------------+</span>
<span class="sd"> |localtimestamp() |</span>
<span class="sd"> +-----------------------+</span>
<span class="sd"> |2022-08-26 21:28:34.639|</span>
<span class="sd"> +-----------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;localtimestamp&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="date_format"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.date_format.html#pyspark.sql.functions.date_format">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">date_format</span><span class="p">(</span><span class="n">date</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts a date/timestamp/string to a value of string in the format specified by the date</span>
<span class="sd"> format given by the second argument.</span>
<span class="sd"> A pattern could be for instance `dd.MM.yyyy` and could return a string like &#39;18.03.1993&#39;. All</span>
<span class="sd"> pattern letters of `datetime pattern`_. can be used.</span>
<span class="sd"> .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Whenever possible, use specialized functions like `year`.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> date : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> input column of values to format.</span>
<span class="sd"> format: str</span>
<span class="sd"> format to use to represent datetime values.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> string value representing formatted datetime.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(date_format(&#39;dt&#39;, &#39;MM/dd/yyy&#39;).alias(&#39;date&#39;)).collect()</span>
<span class="sd"> [Row(date=&#39;04/08/2015&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;date_format&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">date</span><span class="p">),</span> <span class="nb">format</span><span class="p">)</span></div>
<div class="viewcode-block" id="year"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.year.html#pyspark.sql.functions.year">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">year</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the year of a given date/timestamp as integer.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target date/timestamp column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> year part of the date/timestamp as integer.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(year(&#39;dt&#39;).alias(&#39;year&#39;)).collect()</span>
<span class="sd"> [Row(year=2015)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;year&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="quarter"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.quarter.html#pyspark.sql.functions.quarter">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">quarter</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the quarter of a given date/timestamp as integer.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target date/timestamp column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> quarter of the date/timestamp as integer.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(quarter(&#39;dt&#39;).alias(&#39;quarter&#39;)).collect()</span>
<span class="sd"> [Row(quarter=2)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;quarter&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="month"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.month.html#pyspark.sql.functions.month">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">month</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the month of a given date/timestamp as integer.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target date/timestamp column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> month part of the date/timestamp as integer.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(month(&#39;dt&#39;).alias(&#39;month&#39;)).collect()</span>
<span class="sd"> [Row(month=4)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;month&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="dayofweek"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.dayofweek.html#pyspark.sql.functions.dayofweek">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">dayofweek</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the day of the week of a given date/timestamp as integer.</span>
<span class="sd"> Ranges from 1 for a Sunday through to 7 for a Saturday</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target date/timestamp column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> day of the week for given date/timestamp as integer.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(dayofweek(&#39;dt&#39;).alias(&#39;day&#39;)).collect()</span>
<span class="sd"> [Row(day=4)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;dayofweek&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="dayofmonth"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.dayofmonth.html#pyspark.sql.functions.dayofmonth">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">dayofmonth</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the day of the month of a given date/timestamp as integer.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target date/timestamp column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> day of the month for given date/timestamp as integer.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(dayofmonth(&#39;dt&#39;).alias(&#39;day&#39;)).collect()</span>
<span class="sd"> [Row(day=8)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;dayofmonth&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="day"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.day.html#pyspark.sql.functions.day">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">day</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the day of the month of a given date/timestamp as integer.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target date/timestamp column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> day of the month for given date/timestamp as integer.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(day(&#39;dt&#39;).alias(&#39;day&#39;)).collect()</span>
<span class="sd"> [Row(day=8)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;day&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="dayofyear"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.dayofyear.html#pyspark.sql.functions.dayofyear">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">dayofyear</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the day of the year of a given date/timestamp as integer.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target date/timestamp column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> day of the year for given date/timestamp as integer.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(dayofyear(&#39;dt&#39;).alias(&#39;day&#39;)).collect()</span>
<span class="sd"> [Row(day=98)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;dayofyear&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="hour"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.hour.html#pyspark.sql.functions.hour">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">hour</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the hours of a given timestamp as integer.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target date/timestamp column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> hour part of the timestamp as integer.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import datetime</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], [&#39;ts&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(hour(&#39;ts&#39;).alias(&#39;hour&#39;)).collect()</span>
<span class="sd"> [Row(hour=13)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;hour&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="minute"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.minute.html#pyspark.sql.functions.minute">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">minute</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the minutes of a given timestamp as integer.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target date/timestamp column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> minutes part of the timestamp as integer.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import datetime</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], [&#39;ts&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(minute(&#39;ts&#39;).alias(&#39;minute&#39;)).collect()</span>
<span class="sd"> [Row(minute=8)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;minute&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="second"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.second.html#pyspark.sql.functions.second">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">second</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the seconds of a given date as integer.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target date/timestamp column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> `seconds` part of the timestamp as integer.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import datetime</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], [&#39;ts&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(second(&#39;ts&#39;).alias(&#39;second&#39;)).collect()</span>
<span class="sd"> [Row(second=15)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;second&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="weekofyear"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.weekofyear.html#pyspark.sql.functions.weekofyear">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">weekofyear</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the week number of a given date as integer.</span>
<span class="sd"> A week is considered to start on a Monday and week 1 is the first week with more than 3 days,</span>
<span class="sd"> as defined by ISO 8601</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target timestamp column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> `week` of the year for given date as integer.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(weekofyear(df.dt).alias(&#39;week&#39;)).collect()</span>
<span class="sd"> [Row(week=15)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;weekofyear&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="weekday"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.weekday.html#pyspark.sql.functions.weekday">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">weekday</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the day of the week for date/timestamp (0 = Monday, 1 = Tuesday, ..., 6 = Sunday).</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target date/timestamp column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the day of the week for date/timestamp (0 = Monday, 1 = Tuesday, ..., 6 = Sunday).</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(weekday(&#39;dt&#39;).alias(&#39;day&#39;)).show()</span>
<span class="sd"> +---+</span>
<span class="sd"> |day|</span>
<span class="sd"> +---+</span>
<span class="sd"> | 2|</span>
<span class="sd"> +---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;weekday&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="extract"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.extract.html#pyspark.sql.functions.extract">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">extract</span><span class="p">(</span><span class="n">field</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">source</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extracts a part of the date/timestamp or interval source.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> field : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> selects which part of the source should be extracted.</span>
<span class="sd"> source : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a date/timestamp or interval column from where `field` should be extracted.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a part of the date/timestamp or interval source.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import datetime</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], [&#39;ts&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... extract(lit(&#39;YEAR&#39;), &#39;ts&#39;).alias(&#39;year&#39;),</span>
<span class="sd"> ... extract(lit(&#39;month&#39;), &#39;ts&#39;).alias(&#39;month&#39;),</span>
<span class="sd"> ... extract(lit(&#39;WEEK&#39;), &#39;ts&#39;).alias(&#39;week&#39;),</span>
<span class="sd"> ... extract(lit(&#39;D&#39;), &#39;ts&#39;).alias(&#39;day&#39;),</span>
<span class="sd"> ... extract(lit(&#39;M&#39;), &#39;ts&#39;).alias(&#39;minute&#39;),</span>
<span class="sd"> ... extract(lit(&#39;S&#39;), &#39;ts&#39;).alias(&#39;second&#39;)</span>
<span class="sd"> ... ).collect()</span>
<span class="sd"> [Row(year=2015, month=4, week=15, day=8, minute=8, second=Decimal(&#39;15.000000&#39;))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;extract&quot;</span><span class="p">,</span> <span class="n">field</span><span class="p">,</span> <span class="n">source</span><span class="p">)</span></div>
<div class="viewcode-block" id="date_part"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.date_part.html#pyspark.sql.functions.date_part">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">date_part</span><span class="p">(</span><span class="n">field</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">source</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extracts a part of the date/timestamp or interval source.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> field : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> selects which part of the source should be extracted, and supported string values</span>
<span class="sd"> are as same as the fields of the equivalent function `extract`.</span>
<span class="sd"> source : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a date/timestamp or interval column from where `field` should be extracted.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a part of the date/timestamp or interval source.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import datetime</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], [&#39;ts&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... date_part(lit(&#39;YEAR&#39;), &#39;ts&#39;).alias(&#39;year&#39;),</span>
<span class="sd"> ... date_part(lit(&#39;month&#39;), &#39;ts&#39;).alias(&#39;month&#39;),</span>
<span class="sd"> ... date_part(lit(&#39;WEEK&#39;), &#39;ts&#39;).alias(&#39;week&#39;),</span>
<span class="sd"> ... date_part(lit(&#39;D&#39;), &#39;ts&#39;).alias(&#39;day&#39;),</span>
<span class="sd"> ... date_part(lit(&#39;M&#39;), &#39;ts&#39;).alias(&#39;minute&#39;),</span>
<span class="sd"> ... date_part(lit(&#39;S&#39;), &#39;ts&#39;).alias(&#39;second&#39;)</span>
<span class="sd"> ... ).collect()</span>
<span class="sd"> [Row(year=2015, month=4, week=15, day=8, minute=8, second=Decimal(&#39;15.000000&#39;))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;date_part&quot;</span><span class="p">,</span> <span class="n">field</span><span class="p">,</span> <span class="n">source</span><span class="p">)</span></div>
<div class="viewcode-block" id="datepart"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.datepart.html#pyspark.sql.functions.datepart">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">datepart</span><span class="p">(</span><span class="n">field</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">source</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extracts a part of the date/timestamp or interval source.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> field : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> selects which part of the source should be extracted, and supported string values</span>
<span class="sd"> are as same as the fields of the equivalent function `extract`.</span>
<span class="sd"> source : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a date/timestamp or interval column from where `field` should be extracted.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a part of the date/timestamp or interval source.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import datetime</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], [&#39;ts&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... datepart(lit(&#39;YEAR&#39;), &#39;ts&#39;).alias(&#39;year&#39;),</span>
<span class="sd"> ... datepart(lit(&#39;month&#39;), &#39;ts&#39;).alias(&#39;month&#39;),</span>
<span class="sd"> ... datepart(lit(&#39;WEEK&#39;), &#39;ts&#39;).alias(&#39;week&#39;),</span>
<span class="sd"> ... datepart(lit(&#39;D&#39;), &#39;ts&#39;).alias(&#39;day&#39;),</span>
<span class="sd"> ... datepart(lit(&#39;M&#39;), &#39;ts&#39;).alias(&#39;minute&#39;),</span>
<span class="sd"> ... datepart(lit(&#39;S&#39;), &#39;ts&#39;).alias(&#39;second&#39;)</span>
<span class="sd"> ... ).collect()</span>
<span class="sd"> [Row(year=2015, month=4, week=15, day=8, minute=8, second=Decimal(&#39;15.000000&#39;))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;datepart&quot;</span><span class="p">,</span> <span class="n">field</span><span class="p">,</span> <span class="n">source</span><span class="p">)</span></div>
<div class="viewcode-block" id="make_date"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.make_date.html#pyspark.sql.functions.make_date">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">make_date</span><span class="p">(</span><span class="n">year</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">month</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">day</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a column with a date built from the year, month and day columns.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> year : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The year to build the date</span>
<span class="sd"> month : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The month to build the date</span>
<span class="sd"> day : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The day to build the date</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a date built from given parts.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(2020, 6, 26)], [&#39;Y&#39;, &#39;M&#39;, &#39;D&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(make_date(df.Y, df.M, df.D).alias(&quot;datefield&quot;)).collect()</span>
<span class="sd"> [Row(datefield=datetime.date(2020, 6, 26))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;make_date&quot;</span><span class="p">,</span> <span class="n">year</span><span class="p">,</span> <span class="n">month</span><span class="p">,</span> <span class="n">day</span><span class="p">)</span></div>
<div class="viewcode-block" id="date_add"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.date_add.html#pyspark.sql.functions.date_add">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">date_add</span><span class="p">(</span><span class="n">start</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">days</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">int</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the date that is `days` days after `start`. If `days` is a negative value</span>
<span class="sd"> then these amount of days will be deducted from `start`.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> start : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> date column to work on.</span>
<span class="sd"> days : :class:`~pyspark.sql.Column` or str or int</span>
<span class="sd"> how many days after the given date to calculate.</span>
<span class="sd"> Accepts negative value as well to calculate backwards in time.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a date after/before given number of days.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;, 2,)], [&#39;dt&#39;, &#39;add&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(date_add(df.dt, 1).alias(&#39;next_date&#39;)).collect()</span>
<span class="sd"> [Row(next_date=datetime.date(2015, 4, 9))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(date_add(df.dt, df.add.cast(&#39;integer&#39;)).alias(&#39;next_date&#39;)).collect()</span>
<span class="sd"> [Row(next_date=datetime.date(2015, 4, 10))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(date_add(&#39;dt&#39;, -1).alias(&#39;prev_date&#39;)).collect()</span>
<span class="sd"> [Row(prev_date=datetime.date(2015, 4, 7))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">days</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">days</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">days</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">days</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;date_add&quot;</span><span class="p">,</span> <span class="n">start</span><span class="p">,</span> <span class="n">days</span><span class="p">)</span></div>
<div class="viewcode-block" id="dateadd"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.dateadd.html#pyspark.sql.functions.dateadd">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">dateadd</span><span class="p">(</span><span class="n">start</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">days</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">int</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the date that is `days` days after `start`. If `days` is a negative value</span>
<span class="sd"> then these amount of days will be deducted from `start`.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> start : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> date column to work on.</span>
<span class="sd"> days : :class:`~pyspark.sql.Column` or str or int</span>
<span class="sd"> how many days after the given date to calculate.</span>
<span class="sd"> Accepts negative value as well to calculate backwards in time.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a date after/before given number of days.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(&#39;2015-04-08&#39;, 2,)], [&#39;dt&#39;, &#39;add&#39;]</span>
<span class="sd"> ... ).select(sf.dateadd(&quot;dt&quot;, 1)).show()</span>
<span class="sd"> +---------------+</span>
<span class="sd"> |date_add(dt, 1)|</span>
<span class="sd"> +---------------+</span>
<span class="sd"> | 2015-04-09|</span>
<span class="sd"> +---------------+</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(&#39;2015-04-08&#39;, 2,)], [&#39;dt&#39;, &#39;add&#39;]</span>
<span class="sd"> ... ).select(sf.dateadd(&quot;dt&quot;, sf.lit(2))).show()</span>
<span class="sd"> +---------------+</span>
<span class="sd"> |date_add(dt, 2)|</span>
<span class="sd"> +---------------+</span>
<span class="sd"> | 2015-04-10|</span>
<span class="sd"> +---------------+</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(&#39;2015-04-08&#39;, 2,)], [&#39;dt&#39;, &#39;add&#39;]</span>
<span class="sd"> ... ).select(sf.dateadd(&quot;dt&quot;, -1)).show()</span>
<span class="sd"> +----------------+</span>
<span class="sd"> |date_add(dt, -1)|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> | 2015-04-07|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">days</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">days</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">days</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">days</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;dateadd&quot;</span><span class="p">,</span> <span class="n">start</span><span class="p">,</span> <span class="n">days</span><span class="p">)</span></div>
<div class="viewcode-block" id="date_sub"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.date_sub.html#pyspark.sql.functions.date_sub">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">date_sub</span><span class="p">(</span><span class="n">start</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">days</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">int</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the date that is `days` days before `start`. If `days` is a negative value</span>
<span class="sd"> then these amount of days will be added to `start`.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> start : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> date column to work on.</span>
<span class="sd"> days : :class:`~pyspark.sql.Column` or str or int</span>
<span class="sd"> how many days before the given date to calculate.</span>
<span class="sd"> Accepts negative value as well to calculate forward in time.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a date before/after given number of days.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;, 2,)], [&#39;dt&#39;, &#39;sub&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(date_sub(df.dt, 1).alias(&#39;prev_date&#39;)).collect()</span>
<span class="sd"> [Row(prev_date=datetime.date(2015, 4, 7))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(date_sub(df.dt, df.sub.cast(&#39;integer&#39;)).alias(&#39;prev_date&#39;)).collect()</span>
<span class="sd"> [Row(prev_date=datetime.date(2015, 4, 6))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(date_sub(&#39;dt&#39;, -1).alias(&#39;next_date&#39;)).collect()</span>
<span class="sd"> [Row(next_date=datetime.date(2015, 4, 9))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">days</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">days</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">days</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">days</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;date_sub&quot;</span><span class="p">,</span> <span class="n">start</span><span class="p">,</span> <span class="n">days</span><span class="p">)</span></div>
<div class="viewcode-block" id="datediff"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.datediff.html#pyspark.sql.functions.datediff">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">datediff</span><span class="p">(</span><span class="n">end</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">start</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the number of days from `start` to `end`.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> end : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> to date column to work on.</span>
<span class="sd"> start : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> from date column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> difference in days between two dates.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,&#39;2015-05-10&#39;)], [&#39;d1&#39;, &#39;d2&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(datediff(df.d2, df.d1).alias(&#39;diff&#39;)).collect()</span>
<span class="sd"> [Row(diff=32)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;datediff&quot;</span><span class="p">,</span> <span class="n">end</span><span class="p">,</span> <span class="n">start</span><span class="p">)</span></div>
<div class="viewcode-block" id="date_diff"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.date_diff.html#pyspark.sql.functions.date_diff">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">date_diff</span><span class="p">(</span><span class="n">end</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">start</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the number of days from `start` to `end`.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> end : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> to date column to work on.</span>
<span class="sd"> start : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> from date column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> difference in days between two dates.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,&#39;2015-05-10&#39;)], [&#39;d1&#39;, &#39;d2&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(date_diff(df.d2, df.d1).alias(&#39;diff&#39;)).collect()</span>
<span class="sd"> [Row(diff=32)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;date_diff&quot;</span><span class="p">,</span> <span class="n">end</span><span class="p">,</span> <span class="n">start</span><span class="p">)</span></div>
<div class="viewcode-block" id="date_from_unix_date"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.date_from_unix_date.html#pyspark.sql.functions.date_from_unix_date">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">date_from_unix_date</span><span class="p">(</span><span class="n">days</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Create date from the number of `days` since 1970-01-01.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> days : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the target column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the date from the number of days since 1970-01-01.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(date_from_unix_date(lit(1))).show()</span>
<span class="sd"> +----------------------+</span>
<span class="sd"> |date_from_unix_date(1)|</span>
<span class="sd"> +----------------------+</span>
<span class="sd"> | 1970-01-02|</span>
<span class="sd"> +----------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;date_from_unix_date&quot;</span><span class="p">,</span> <span class="n">days</span><span class="p">)</span></div>
<div class="viewcode-block" id="add_months"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.add_months.html#pyspark.sql.functions.add_months">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">add_months</span><span class="p">(</span><span class="n">start</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">months</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">int</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the date that is `months` months after `start`. If `months` is a negative value</span>
<span class="sd"> then these amount of months will be deducted from the `start`.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> start : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> date column to work on.</span>
<span class="sd"> months : :class:`~pyspark.sql.Column` or str or int</span>
<span class="sd"> how many months after the given date to calculate.</span>
<span class="sd"> Accepts negative value as well to calculate backwards.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a date after/before given number of months.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;, 2)], [&#39;dt&#39;, &#39;add&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(add_months(df.dt, 1).alias(&#39;next_month&#39;)).collect()</span>
<span class="sd"> [Row(next_month=datetime.date(2015, 5, 8))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(add_months(df.dt, df.add.cast(&#39;integer&#39;)).alias(&#39;next_month&#39;)).collect()</span>
<span class="sd"> [Row(next_month=datetime.date(2015, 6, 8))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(add_months(&#39;dt&#39;, -2).alias(&#39;prev_month&#39;)).collect()</span>
<span class="sd"> [Row(prev_month=datetime.date(2015, 2, 8))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">months</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">months</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">months</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">months</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;add_months&quot;</span><span class="p">,</span> <span class="n">start</span><span class="p">,</span> <span class="n">months</span><span class="p">)</span></div>
<div class="viewcode-block" id="months_between"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.months_between.html#pyspark.sql.functions.months_between">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">months_between</span><span class="p">(</span><span class="n">date1</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">date2</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">roundOff</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns number of months between dates date1 and date2.</span>
<span class="sd"> If date1 is later than date2, then the result is positive.</span>
<span class="sd"> A whole number is returned if both inputs have the same day of month or both are the last day</span>
<span class="sd"> of their respective months. Otherwise, the difference is calculated assuming 31 days per month.</span>
<span class="sd"> The result is rounded off to 8 digits unless `roundOff` is set to `False`.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> date1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> first date column.</span>
<span class="sd"> date2 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> second date column.</span>
<span class="sd"> roundOff : bool, optional</span>
<span class="sd"> whether to round (to 8 digits) the final value or not (default: True).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> number of months between two dates.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-28 10:30:00&#39;, &#39;1996-10-30&#39;)], [&#39;date1&#39;, &#39;date2&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(months_between(df.date1, df.date2).alias(&#39;months&#39;)).collect()</span>
<span class="sd"> [Row(months=3.94959677)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(months_between(df.date1, df.date2, False).alias(&#39;months&#39;)).collect()</span>
<span class="sd"> [Row(months=3.9495967741935485)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span>
<span class="s2">&quot;months_between&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">date1</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">date2</span><span class="p">),</span> <span class="n">roundOff</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="to_date"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_date.html#pyspark.sql.functions.to_date">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">to_date</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Converts a :class:`~pyspark.sql.Column` into :class:`pyspark.sql.types.DateType`</span>
<span class="sd"> using the optionally specified format. Specify formats according to `datetime pattern`_.</span>
<span class="sd"> By default, it follows casting rules to :class:`pyspark.sql.types.DateType` if the format</span>
<span class="sd"> is omitted. Equivalent to ``col.cast(&quot;date&quot;)``.</span>
<span class="sd"> .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html</span>
<span class="sd"> .. versionadded:: 2.2.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> input column of values to convert.</span>
<span class="sd"> format: str, optional</span>
<span class="sd"> format to use to convert date values.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> date value as :class:`pyspark.sql.types.DateType` type.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-28 10:30:00&#39;,)], [&#39;t&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_date(df.t).alias(&#39;date&#39;)).collect()</span>
<span class="sd"> [Row(date=datetime.date(1997, 2, 28))]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-28 10:30:00&#39;,)], [&#39;t&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_date(df.t, &#39;yyyy-MM-dd HH:mm:ss&#39;).alias(&#39;date&#39;)).collect()</span>
<span class="sd"> [Row(date=datetime.date(1997, 2, 28))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">format</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;to_date&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;to_date&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="nb">format</span><span class="p">)</span></div>
<div class="viewcode-block" id="unix_date"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.unix_date.html#pyspark.sql.functions.unix_date">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">unix_date</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the number of days since 1970-01-01.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.set(&quot;spark.sql.session.timeZone&quot;, &quot;America/Los_Angeles&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1970-01-02&#39;,)], [&#39;t&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(unix_date(to_date(df.t)).alias(&#39;n&#39;)).collect()</span>
<span class="sd"> [Row(n=1)]</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.unset(&quot;spark.sql.session.timeZone&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;unix_date&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="unix_micros"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.unix_micros.html#pyspark.sql.functions.unix_micros">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">unix_micros</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the number of microseconds since 1970-01-01 00:00:00 UTC.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.set(&quot;spark.sql.session.timeZone&quot;, &quot;America/Los_Angeles&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-07-22 10:00:00&#39;,)], [&#39;t&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(unix_micros(to_timestamp(df.t)).alias(&#39;n&#39;)).collect()</span>
<span class="sd"> [Row(n=1437584400000000)]</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.unset(&quot;spark.sql.session.timeZone&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;unix_micros&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="unix_millis"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.unix_millis.html#pyspark.sql.functions.unix_millis">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">unix_millis</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the number of milliseconds since 1970-01-01 00:00:00 UTC.</span>
<span class="sd"> Truncates higher levels of precision.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.set(&quot;spark.sql.session.timeZone&quot;, &quot;America/Los_Angeles&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-07-22 10:00:00&#39;,)], [&#39;t&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(unix_millis(to_timestamp(df.t)).alias(&#39;n&#39;)).collect()</span>
<span class="sd"> [Row(n=1437584400000)]</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.unset(&quot;spark.sql.session.timeZone&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;unix_millis&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="unix_seconds"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.unix_seconds.html#pyspark.sql.functions.unix_seconds">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">unix_seconds</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the number of seconds since 1970-01-01 00:00:00 UTC.</span>
<span class="sd"> Truncates higher levels of precision.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.set(&quot;spark.sql.session.timeZone&quot;, &quot;America/Los_Angeles&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-07-22 10:00:00&#39;,)], [&#39;t&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(unix_seconds(to_timestamp(df.t)).alias(&#39;n&#39;)).collect()</span>
<span class="sd"> [Row(n=1437584400)]</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.unset(&quot;spark.sql.session.timeZone&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;unix_seconds&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">to_timestamp</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">to_timestamp</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="to_timestamp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_timestamp.html#pyspark.sql.functions.to_timestamp">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">to_timestamp</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Converts a :class:`~pyspark.sql.Column` into :class:`pyspark.sql.types.TimestampType`</span>
<span class="sd"> using the optionally specified format. Specify formats according to `datetime pattern`_.</span>
<span class="sd"> By default, it follows casting rules to :class:`pyspark.sql.types.TimestampType` if the format</span>
<span class="sd"> is omitted. Equivalent to ``col.cast(&quot;timestamp&quot;)``.</span>
<span class="sd"> .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html</span>
<span class="sd"> .. versionadded:: 2.2.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column values to convert.</span>
<span class="sd"> format: str, optional</span>
<span class="sd"> format to use to convert timestamp values.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> timestamp value as :class:`pyspark.sql.types.TimestampType` type.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-28 10:30:00&#39;,)], [&#39;t&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_timestamp(df.t).alias(&#39;dt&#39;)).collect()</span>
<span class="sd"> [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-28 10:30:00&#39;,)], [&#39;t&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_timestamp(df.t, &#39;yyyy-MM-dd HH:mm:ss&#39;).alias(&#39;dt&#39;)).collect()</span>
<span class="sd"> [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">format</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;to_timestamp&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;to_timestamp&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="nb">format</span><span class="p">)</span></div>
<div class="viewcode-block" id="try_to_timestamp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_to_timestamp.html#pyspark.sql.functions.try_to_timestamp">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">try_to_timestamp</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Parses the `col` with the `format` to a timestamp. The function always</span>
<span class="sd"> returns null on an invalid input with/without ANSI SQL mode enabled. The result data type is</span>
<span class="sd"> consistent with the value of configuration `spark.sql.timestampType`.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column values to convert.</span>
<span class="sd"> format: str, optional</span>
<span class="sd"> format to use to convert timestamp values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-28 10:30:00&#39;,)], [&#39;t&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_to_timestamp(df.t).alias(&#39;dt&#39;)).collect()</span>
<span class="sd"> [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_to_timestamp(df.t, lit(&#39;yyyy-MM-dd HH:mm:ss&#39;)).alias(&#39;dt&#39;)).collect()</span>
<span class="sd"> [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">format</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;try_to_timestamp&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="nb">format</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;try_to_timestamp&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="xpath"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.xpath.html#pyspark.sql.functions.xpath">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">xpath</span><span class="p">(</span><span class="n">xml</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a string array of values within the nodes of xml that match the XPath expression.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(&#39;&lt;a&gt;&lt;b&gt;b1&lt;/b&gt;&lt;b&gt;b2&lt;/b&gt;&lt;b&gt;b3&lt;/b&gt;&lt;c&gt;c1&lt;/c&gt;&lt;c&gt;c2&lt;/c&gt;&lt;/a&gt;&#39;,)], [&#39;x&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(xpath(df.x, lit(&#39;a/b/text()&#39;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=[&#39;b1&#39;, &#39;b2&#39;, &#39;b3&#39;])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;xpath&quot;</span><span class="p">,</span> <span class="n">xml</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span></div>
<div class="viewcode-block" id="xpath_boolean"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.xpath_boolean.html#pyspark.sql.functions.xpath_boolean">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">xpath_boolean</span><span class="p">(</span><span class="n">xml</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns true if the XPath expression evaluates to true, or if a matching node is found.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;&lt;a&gt;&lt;b&gt;1&lt;/b&gt;&lt;/a&gt;&#39;,)], [&#39;x&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(xpath_boolean(df.x, lit(&#39;a/b&#39;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=True)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;xpath_boolean&quot;</span><span class="p">,</span> <span class="n">xml</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span></div>
<div class="viewcode-block" id="xpath_double"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.xpath_double.html#pyspark.sql.functions.xpath_double">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">xpath_double</span><span class="p">(</span><span class="n">xml</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a double value, the value zero if no match is found,</span>
<span class="sd"> or NaN if a match is found but the value is non-numeric.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;&lt;a&gt;&lt;b&gt;1&lt;/b&gt;&lt;b&gt;2&lt;/b&gt;&lt;/a&gt;&#39;,)], [&#39;x&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(xpath_double(df.x, lit(&#39;sum(a/b)&#39;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=3.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;xpath_double&quot;</span><span class="p">,</span> <span class="n">xml</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span></div>
<div class="viewcode-block" id="xpath_number"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.xpath_number.html#pyspark.sql.functions.xpath_number">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">xpath_number</span><span class="p">(</span><span class="n">xml</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a double value, the value zero if no match is found,</span>
<span class="sd"> or NaN if a match is found but the value is non-numeric.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(&#39;&lt;a&gt;&lt;b&gt;1&lt;/b&gt;&lt;b&gt;2&lt;/b&gt;&lt;/a&gt;&#39;,)], [&#39;x&#39;]</span>
<span class="sd"> ... ).select(sf.xpath_number(&#39;x&#39;, sf.lit(&#39;sum(a/b)&#39;))).show()</span>
<span class="sd"> +-------------------------+</span>
<span class="sd"> |xpath_number(x, sum(a/b))|</span>
<span class="sd"> +-------------------------+</span>
<span class="sd"> | 3.0|</span>
<span class="sd"> +-------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;xpath_number&quot;</span><span class="p">,</span> <span class="n">xml</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span></div>
<div class="viewcode-block" id="xpath_float"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.xpath_float.html#pyspark.sql.functions.xpath_float">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">xpath_float</span><span class="p">(</span><span class="n">xml</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a float value, the value zero if no match is found,</span>
<span class="sd"> or NaN if a match is found but the value is non-numeric.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;&lt;a&gt;&lt;b&gt;1&lt;/b&gt;&lt;b&gt;2&lt;/b&gt;&lt;/a&gt;&#39;,)], [&#39;x&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(xpath_float(df.x, lit(&#39;sum(a/b)&#39;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=3.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;xpath_float&quot;</span><span class="p">,</span> <span class="n">xml</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span></div>
<div class="viewcode-block" id="xpath_int"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.xpath_int.html#pyspark.sql.functions.xpath_int">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">xpath_int</span><span class="p">(</span><span class="n">xml</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns an integer value, or the value zero if no match is found,</span>
<span class="sd"> or a match is found but the value is non-numeric.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;&lt;a&gt;&lt;b&gt;1&lt;/b&gt;&lt;b&gt;2&lt;/b&gt;&lt;/a&gt;&#39;,)], [&#39;x&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(xpath_int(df.x, lit(&#39;sum(a/b)&#39;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=3)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;xpath_int&quot;</span><span class="p">,</span> <span class="n">xml</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span></div>
<div class="viewcode-block" id="xpath_long"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.xpath_long.html#pyspark.sql.functions.xpath_long">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">xpath_long</span><span class="p">(</span><span class="n">xml</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a long integer value, or the value zero if no match is found,</span>
<span class="sd"> or a match is found but the value is non-numeric.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;&lt;a&gt;&lt;b&gt;1&lt;/b&gt;&lt;b&gt;2&lt;/b&gt;&lt;/a&gt;&#39;,)], [&#39;x&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(xpath_long(df.x, lit(&#39;sum(a/b)&#39;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=3)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;xpath_long&quot;</span><span class="p">,</span> <span class="n">xml</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span></div>
<div class="viewcode-block" id="xpath_short"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.xpath_short.html#pyspark.sql.functions.xpath_short">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">xpath_short</span><span class="p">(</span><span class="n">xml</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a short integer value, or the value zero if no match is found,</span>
<span class="sd"> or a match is found but the value is non-numeric.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;&lt;a&gt;&lt;b&gt;1&lt;/b&gt;&lt;b&gt;2&lt;/b&gt;&lt;/a&gt;&#39;,)], [&#39;x&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(xpath_short(df.x, lit(&#39;sum(a/b)&#39;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=3)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;xpath_short&quot;</span><span class="p">,</span> <span class="n">xml</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span></div>
<div class="viewcode-block" id="xpath_string"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.xpath_string.html#pyspark.sql.functions.xpath_string">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">xpath_string</span><span class="p">(</span><span class="n">xml</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the text contents of the first xml node that matches the XPath expression.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;&lt;a&gt;&lt;b&gt;b&lt;/b&gt;&lt;c&gt;cc&lt;/c&gt;&lt;/a&gt;&#39;,)], [&#39;x&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(xpath_string(df.x, lit(&#39;a/c&#39;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;cc&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;xpath_string&quot;</span><span class="p">,</span> <span class="n">xml</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span></div>
<div class="viewcode-block" id="trunc"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.trunc.html#pyspark.sql.functions.trunc">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">trunc</span><span class="p">(</span><span class="n">date</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns date truncated to the unit specified by the format.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> date : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> input column of values to truncate.</span>
<span class="sd"> format : str</span>
<span class="sd"> &#39;year&#39;, &#39;yyyy&#39;, &#39;yy&#39; to truncate by year,</span>
<span class="sd"> or &#39;month&#39;, &#39;mon&#39;, &#39;mm&#39; to truncate by month</span>
<span class="sd"> Other options are: &#39;week&#39;, &#39;quarter&#39;</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> truncated date.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-28&#39;,)], [&#39;d&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(trunc(df.d, &#39;year&#39;).alias(&#39;year&#39;)).collect()</span>
<span class="sd"> [Row(year=datetime.date(1997, 1, 1))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(trunc(df.d, &#39;mon&#39;).alias(&#39;month&#39;)).collect()</span>
<span class="sd"> [Row(month=datetime.date(1997, 2, 1))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;trunc&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">date</span><span class="p">),</span> <span class="nb">format</span><span class="p">)</span></div>
<div class="viewcode-block" id="date_trunc"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.date_trunc.html#pyspark.sql.functions.date_trunc">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">date_trunc</span><span class="p">(</span><span class="nb">format</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns timestamp truncated to the unit specified by the format.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> format : str</span>
<span class="sd"> &#39;year&#39;, &#39;yyyy&#39;, &#39;yy&#39; to truncate by year,</span>
<span class="sd"> &#39;month&#39;, &#39;mon&#39;, &#39;mm&#39; to truncate by month,</span>
<span class="sd"> &#39;day&#39;, &#39;dd&#39; to truncate by day,</span>
<span class="sd"> Other options are:</span>
<span class="sd"> &#39;microsecond&#39;, &#39;millisecond&#39;, &#39;second&#39;, &#39;minute&#39;, &#39;hour&#39;, &#39;week&#39;, &#39;quarter&#39;</span>
<span class="sd"> timestamp : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> input column of values to truncate.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> truncated timestamp.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-28 05:02:11&#39;,)], [&#39;t&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(date_trunc(&#39;year&#39;, df.t).alias(&#39;year&#39;)).collect()</span>
<span class="sd"> [Row(year=datetime.datetime(1997, 1, 1, 0, 0))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(date_trunc(&#39;mon&#39;, df.t).alias(&#39;month&#39;)).collect()</span>
<span class="sd"> [Row(month=datetime.datetime(1997, 2, 1, 0, 0))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;date_trunc&quot;</span><span class="p">,</span> <span class="nb">format</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">timestamp</span><span class="p">))</span></div>
<div class="viewcode-block" id="next_day"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.next_day.html#pyspark.sql.functions.next_day">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">next_day</span><span class="p">(</span><span class="n">date</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">dayOfWeek</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the first date which is later than the value of the date column</span>
<span class="sd"> based on second `week day` argument.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> date : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> dayOfWeek : str</span>
<span class="sd"> day of the week, case-insensitive, accepts:</span>
<span class="sd"> &quot;Mon&quot;, &quot;Tue&quot;, &quot;Wed&quot;, &quot;Thu&quot;, &quot;Fri&quot;, &quot;Sat&quot;, &quot;Sun&quot;</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column of computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-07-27&#39;,)], [&#39;d&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(next_day(df.d, &#39;Sun&#39;).alias(&#39;date&#39;)).collect()</span>
<span class="sd"> [Row(date=datetime.date(2015, 8, 2))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;next_day&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">date</span><span class="p">),</span> <span class="n">dayOfWeek</span><span class="p">)</span></div>
<div class="viewcode-block" id="last_day"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.last_day.html#pyspark.sql.functions.last_day">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">last_day</span><span class="p">(</span><span class="n">date</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the last day of the month which the given date belongs to.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> date : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> last day of the month.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-10&#39;,)], [&#39;d&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(last_day(df.d).alias(&#39;date&#39;)).collect()</span>
<span class="sd"> [Row(date=datetime.date(1997, 2, 28))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;last_day&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">date</span><span class="p">))</span></div>
<div class="viewcode-block" id="from_unixtime"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.from_unixtime.html#pyspark.sql.functions.from_unixtime">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">from_unixtime</span><span class="p">(</span><span class="n">timestamp</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;yyyy-MM-dd HH:mm:ss&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string</span>
<span class="sd"> representing the timestamp of that moment in the current system time zone in the given</span>
<span class="sd"> format.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> timestamp : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column of unix time values.</span>
<span class="sd"> format : str, optional</span>
<span class="sd"> format to use to convert to (default: yyyy-MM-dd HH:mm:ss)</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> formatted timestamp as string.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.set(&quot;spark.sql.session.timeZone&quot;, &quot;America/Los_Angeles&quot;)</span>
<span class="sd"> &gt;&gt;&gt; time_df = spark.createDataFrame([(1428476400,)], [&#39;unix_time&#39;])</span>
<span class="sd"> &gt;&gt;&gt; time_df.select(from_unixtime(&#39;unix_time&#39;).alias(&#39;ts&#39;)).collect()</span>
<span class="sd"> [Row(ts=&#39;2015-04-08 00:00:00&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.unset(&quot;spark.sql.session.timeZone&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;from_unixtime&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">timestamp</span><span class="p">),</span> <span class="nb">format</span><span class="p">)</span></div>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">unix_timestamp</span><span class="p">(</span><span class="n">timestamp</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">unix_timestamp</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="unix_timestamp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.unix_timestamp.html#pyspark.sql.functions.unix_timestamp">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">unix_timestamp</span><span class="p">(</span>
<span class="n">timestamp</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;yyyy-MM-dd HH:mm:ss&quot;</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Convert time string with given pattern (&#39;yyyy-MM-dd HH:mm:ss&#39;, by default)</span>
<span class="sd"> to Unix time stamp (in seconds), using the default timezone and the default</span>
<span class="sd"> locale, returns null if failed.</span>
<span class="sd"> if `timestamp` is None, then it returns current timestamp.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> timestamp : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> timestamps of string values.</span>
<span class="sd"> format : str, optional</span>
<span class="sd"> alternative format to use for converting (default: yyyy-MM-dd HH:mm:ss).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> unix time as long integer.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.set(&quot;spark.sql.session.timeZone&quot;, &quot;America/Los_Angeles&quot;)</span>
<span class="sd"> &gt;&gt;&gt; time_df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; time_df.select(unix_timestamp(&#39;dt&#39;, &#39;yyyy-MM-dd&#39;).alias(&#39;unix_time&#39;)).collect()</span>
<span class="sd"> [Row(unix_time=1428476400)]</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.unset(&quot;spark.sql.session.timeZone&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">timestamp</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;unix_timestamp&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;unix_timestamp&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">timestamp</span><span class="p">),</span> <span class="nb">format</span><span class="p">)</span></div>
<div class="viewcode-block" id="from_utc_timestamp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.from_utc_timestamp.html#pyspark.sql.functions.from_utc_timestamp">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">from_utc_timestamp</span><span class="p">(</span><span class="n">timestamp</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">tz</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> This is a common function for databases supporting TIMESTAMP WITHOUT TIMEZONE. This function</span>
<span class="sd"> takes a timestamp which is timezone-agnostic, and interprets it as a timestamp in UTC, and</span>
<span class="sd"> renders that timestamp as a timestamp in the given time zone.</span>
<span class="sd"> However, timestamp in Spark represents number of microseconds from the Unix epoch, which is not</span>
<span class="sd"> timezone-agnostic. So in Spark this function just shift the timestamp value from UTC timezone to</span>
<span class="sd"> the given timezone.</span>
<span class="sd"> This function may return confusing result if the input is a string with timezone, e.g.</span>
<span class="sd"> &#39;2018-03-13T06:18:23+00:00&#39;. The reason is that, Spark firstly cast the string to timestamp</span>
<span class="sd"> according to the timezone in the string, and finally display the result by converting the</span>
<span class="sd"> timestamp to string according to the session local timezone.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> timestamp : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the column that contains timestamps</span>
<span class="sd"> tz : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A string detailing the time zone ID that the input should be adjusted to. It should</span>
<span class="sd"> be in the format of either region-based zone IDs or zone offsets. Region IDs must</span>
<span class="sd"> have the form &#39;area/city&#39;, such as &#39;America/Los_Angeles&#39;. Zone offsets must be in</span>
<span class="sd"> the format &#39;(+|-)HH:mm&#39;, for example &#39;-08:00&#39; or &#39;+01:00&#39;. Also &#39;UTC&#39; and &#39;Z&#39; are</span>
<span class="sd"> supported as aliases of &#39;+00:00&#39;. Other short names are not recommended to use</span>
<span class="sd"> because they can be ambiguous.</span>
<span class="sd"> .. versionchanged:: 2.4</span>
<span class="sd"> `tz` can take a :class:`~pyspark.sql.Column` containing timezone ID strings.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> timestamp value represented in given timezone.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-28 10:30:00&#39;, &#39;JST&#39;)], [&#39;ts&#39;, &#39;tz&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_utc_timestamp(df.ts, &quot;PST&quot;).alias(&#39;local_time&#39;)).collect()</span>
<span class="sd"> [Row(local_time=datetime.datetime(1997, 2, 28, 2, 30))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_utc_timestamp(df.ts, df.tz).alias(&#39;local_time&#39;)).collect()</span>
<span class="sd"> [Row(local_time=datetime.datetime(1997, 2, 28, 19, 30))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">tz</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="n">tz</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">tz</span><span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;from_utc_timestamp&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">timestamp</span><span class="p">),</span> <span class="n">tz</span><span class="p">)</span></div>
<div class="viewcode-block" id="to_utc_timestamp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_utc_timestamp.html#pyspark.sql.functions.to_utc_timestamp">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">to_utc_timestamp</span><span class="p">(</span><span class="n">timestamp</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">tz</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> This is a common function for databases supporting TIMESTAMP WITHOUT TIMEZONE. This function</span>
<span class="sd"> takes a timestamp which is timezone-agnostic, and interprets it as a timestamp in the given</span>
<span class="sd"> timezone, and renders that timestamp as a timestamp in UTC.</span>
<span class="sd"> However, timestamp in Spark represents number of microseconds from the Unix epoch, which is not</span>
<span class="sd"> timezone-agnostic. So in Spark this function just shift the timestamp value from the given</span>
<span class="sd"> timezone to UTC timezone.</span>
<span class="sd"> This function may return confusing result if the input is a string with timezone, e.g.</span>
<span class="sd"> &#39;2018-03-13T06:18:23+00:00&#39;. The reason is that, Spark firstly cast the string to timestamp</span>
<span class="sd"> according to the timezone in the string, and finally display the result by converting the</span>
<span class="sd"> timestamp to string according to the session local timezone.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> timestamp : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the column that contains timestamps</span>
<span class="sd"> tz : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A string detailing the time zone ID that the input should be adjusted to. It should</span>
<span class="sd"> be in the format of either region-based zone IDs or zone offsets. Region IDs must</span>
<span class="sd"> have the form &#39;area/city&#39;, such as &#39;America/Los_Angeles&#39;. Zone offsets must be in</span>
<span class="sd"> the format &#39;(+|-)HH:mm&#39;, for example &#39;-08:00&#39; or &#39;+01:00&#39;. Also &#39;UTC&#39; and &#39;Z&#39; are</span>
<span class="sd"> supported as aliases of &#39;+00:00&#39;. Other short names are not recommended to use</span>
<span class="sd"> because they can be ambiguous.</span>
<span class="sd"> .. versionchanged:: 2.4.0</span>
<span class="sd"> `tz` can take a :class:`~pyspark.sql.Column` containing timezone ID strings.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> timestamp value represented in UTC timezone.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-28 10:30:00&#39;, &#39;JST&#39;)], [&#39;ts&#39;, &#39;tz&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_utc_timestamp(df.ts, &quot;PST&quot;).alias(&#39;utc_time&#39;)).collect()</span>
<span class="sd"> [Row(utc_time=datetime.datetime(1997, 2, 28, 18, 30))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_utc_timestamp(df.ts, df.tz).alias(&#39;utc_time&#39;)).collect()</span>
<span class="sd"> [Row(utc_time=datetime.datetime(1997, 2, 28, 1, 30))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">tz</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="n">tz</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">tz</span><span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;to_utc_timestamp&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">timestamp</span><span class="p">),</span> <span class="n">tz</span><span class="p">)</span></div>
<div class="viewcode-block" id="timestamp_seconds"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.timestamp_seconds.html#pyspark.sql.functions.timestamp_seconds">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">timestamp_seconds</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts the number of seconds from the Unix epoch (1970-01-01T00:00:00Z)</span>
<span class="sd"> to a timestamp.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> unix time values.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> converted timestamp value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import timestamp_seconds</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.set(&quot;spark.sql.session.timeZone&quot;, &quot;UTC&quot;)</span>
<span class="sd"> &gt;&gt;&gt; time_df = spark.createDataFrame([(1230219000,)], [&#39;unix_time&#39;])</span>
<span class="sd"> &gt;&gt;&gt; time_df.select(timestamp_seconds(time_df.unix_time).alias(&#39;ts&#39;)).show()</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> | ts|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> |2008-12-25 15:30:00|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> &gt;&gt;&gt; time_df.select(timestamp_seconds(&#39;unix_time&#39;).alias(&#39;ts&#39;)).printSchema()</span>
<span class="sd"> root</span>
<span class="sd"> |-- ts: timestamp (nullable = true)</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.unset(&quot;spark.sql.session.timeZone&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;timestamp_seconds&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="timestamp_millis"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.timestamp_millis.html#pyspark.sql.functions.timestamp_millis">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">timestamp_millis</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Creates timestamp from the number of milliseconds since UTC epoch.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> unix time values.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> converted timestamp value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.set(&quot;spark.sql.session.timeZone&quot;, &quot;UTC&quot;)</span>
<span class="sd"> &gt;&gt;&gt; time_df = spark.createDataFrame([(1230219000,)], [&#39;unix_time&#39;])</span>
<span class="sd"> &gt;&gt;&gt; time_df.select(timestamp_millis(time_df.unix_time).alias(&#39;ts&#39;)).show()</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> | ts|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> |1970-01-15 05:43:39|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> &gt;&gt;&gt; time_df.select(timestamp_millis(&#39;unix_time&#39;).alias(&#39;ts&#39;)).printSchema()</span>
<span class="sd"> root</span>
<span class="sd"> |-- ts: timestamp (nullable = true)</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.unset(&quot;spark.sql.session.timeZone&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;timestamp_millis&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="timestamp_micros"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.timestamp_micros.html#pyspark.sql.functions.timestamp_micros">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">timestamp_micros</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Creates timestamp from the number of microseconds since UTC epoch.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> unix time values.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> converted timestamp value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.set(&quot;spark.sql.session.timeZone&quot;, &quot;UTC&quot;)</span>
<span class="sd"> &gt;&gt;&gt; time_df = spark.createDataFrame([(1230219000,)], [&#39;unix_time&#39;])</span>
<span class="sd"> &gt;&gt;&gt; time_df.select(timestamp_micros(time_df.unix_time).alias(&#39;ts&#39;)).show()</span>
<span class="sd"> +--------------------+</span>
<span class="sd"> | ts|</span>
<span class="sd"> +--------------------+</span>
<span class="sd"> |1970-01-01 00:20:...|</span>
<span class="sd"> +--------------------+</span>
<span class="sd"> &gt;&gt;&gt; time_df.select(timestamp_micros(&#39;unix_time&#39;).alias(&#39;ts&#39;)).printSchema()</span>
<span class="sd"> root</span>
<span class="sd"> |-- ts: timestamp (nullable = true)</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.unset(&quot;spark.sql.session.timeZone&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;timestamp_micros&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="window"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.window.html#pyspark.sql.functions.window">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">window</span><span class="p">(</span>
<span class="n">timeColumn</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">windowDuration</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">slideDuration</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">startTime</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Bucketize rows into one or more time windows given a timestamp specifying column. Window</span>
<span class="sd"> starts are inclusive but the window ends are exclusive, e.g. 12:05 will be in the window</span>
<span class="sd"> [12:05,12:10) but not in [12:00,12:05). Windows can support microsecond precision. Windows in</span>
<span class="sd"> the order of months are not supported.</span>
<span class="sd"> The time column must be of :class:`pyspark.sql.types.TimestampType`.</span>
<span class="sd"> Durations are provided as strings, e.g. &#39;1 second&#39;, &#39;1 day 12 hours&#39;, &#39;2 minutes&#39;. Valid</span>
<span class="sd"> interval strings are &#39;week&#39;, &#39;day&#39;, &#39;hour&#39;, &#39;minute&#39;, &#39;second&#39;, &#39;millisecond&#39;, &#39;microsecond&#39;.</span>
<span class="sd"> If the ``slideDuration`` is not provided, the windows will be tumbling windows.</span>
<span class="sd"> The startTime is the offset with respect to 1970-01-01 00:00:00 UTC with which to start</span>
<span class="sd"> window intervals. For example, in order to have hourly tumbling windows that start 15 minutes</span>
<span class="sd"> past the hour, e.g. 12:15-13:15, 13:15-14:15... provide `startTime` as `15 minutes`.</span>
<span class="sd"> The output column will be a struct called &#39;window&#39; by default with the nested columns &#39;start&#39;</span>
<span class="sd"> and &#39;end&#39;, where &#39;start&#39; and &#39;end&#39; will be of :class:`pyspark.sql.types.TimestampType`.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> timeColumn : :class:`~pyspark.sql.Column`</span>
<span class="sd"> The column or the expression to use as the timestamp for windowing by time.</span>
<span class="sd"> The time column must be of TimestampType or TimestampNTZType.</span>
<span class="sd"> windowDuration : str</span>
<span class="sd"> A string specifying the width of the window, e.g. `10 minutes`,</span>
<span class="sd"> `1 second`. Check `org.apache.spark.unsafe.types.CalendarInterval` for</span>
<span class="sd"> valid duration identifiers. Note that the duration is a fixed length of</span>
<span class="sd"> time, and does not vary over time according to a calendar. For example,</span>
<span class="sd"> `1 day` always means 86,400,000 milliseconds, not a calendar day.</span>
<span class="sd"> slideDuration : str, optional</span>
<span class="sd"> A new window will be generated every `slideDuration`. Must be less than</span>
<span class="sd"> or equal to the `windowDuration`. Check</span>
<span class="sd"> `org.apache.spark.unsafe.types.CalendarInterval` for valid duration</span>
<span class="sd"> identifiers. This duration is likewise absolute, and does not vary</span>
<span class="sd"> according to a calendar.</span>
<span class="sd"> startTime : str, optional</span>
<span class="sd"> The offset with respect to 1970-01-01 00:00:00 UTC with which to start</span>
<span class="sd"> window intervals. For example, in order to have hourly tumbling windows that</span>
<span class="sd"> start 15 minutes past the hour, e.g. 12:15-13:15, 13:15-14:15... provide</span>
<span class="sd"> `startTime` as `15 minutes`.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import datetime</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)],</span>
<span class="sd"> ... ).toDF(&quot;date&quot;, &quot;val&quot;)</span>
<span class="sd"> &gt;&gt;&gt; w = df.groupBy(window(&quot;date&quot;, &quot;5 seconds&quot;)).agg(sum(&quot;val&quot;).alias(&quot;sum&quot;))</span>
<span class="sd"> &gt;&gt;&gt; w.select(w.window.start.cast(&quot;string&quot;).alias(&quot;start&quot;),</span>
<span class="sd"> ... w.window.end.cast(&quot;string&quot;).alias(&quot;end&quot;), &quot;sum&quot;).collect()</span>
<span class="sd"> [Row(start=&#39;2016-03-11 09:00:05&#39;, end=&#39;2016-03-11 09:00:10&#39;, sum=1)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">check_string_field</span><span class="p">(</span><span class="n">field</span><span class="p">,</span> <span class="n">fieldName</span><span class="p">):</span> <span class="c1"># type: ignore[no-untyped-def]</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">field</span> <span class="ow">or</span> <span class="nb">type</span><span class="p">(</span><span class="n">field</span><span class="p">)</span> <span class="ow">is</span> <span class="ow">not</span> <span class="nb">str</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span>
<span class="n">error_class</span><span class="o">=</span><span class="s2">&quot;NOT_STR&quot;</span><span class="p">,</span>
<span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;arg_name&quot;</span><span class="p">:</span> <span class="n">fieldName</span><span class="p">,</span> <span class="s2">&quot;arg_type&quot;</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">field</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span>
<span class="p">)</span>
<span class="n">time_col</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">timeColumn</span><span class="p">)</span>
<span class="n">check_string_field</span><span class="p">(</span><span class="n">windowDuration</span><span class="p">,</span> <span class="s2">&quot;windowDuration&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">slideDuration</span> <span class="ow">and</span> <span class="n">startTime</span><span class="p">:</span>
<span class="n">check_string_field</span><span class="p">(</span><span class="n">slideDuration</span><span class="p">,</span> <span class="s2">&quot;slideDuration&quot;</span><span class="p">)</span>
<span class="n">check_string_field</span><span class="p">(</span><span class="n">startTime</span><span class="p">,</span> <span class="s2">&quot;startTime&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;window&quot;</span><span class="p">,</span> <span class="n">time_col</span><span class="p">,</span> <span class="n">windowDuration</span><span class="p">,</span> <span class="n">slideDuration</span><span class="p">,</span> <span class="n">startTime</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">slideDuration</span><span class="p">:</span>
<span class="n">check_string_field</span><span class="p">(</span><span class="n">slideDuration</span><span class="p">,</span> <span class="s2">&quot;slideDuration&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;window&quot;</span><span class="p">,</span> <span class="n">time_col</span><span class="p">,</span> <span class="n">windowDuration</span><span class="p">,</span> <span class="n">slideDuration</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">startTime</span><span class="p">:</span>
<span class="n">check_string_field</span><span class="p">(</span><span class="n">startTime</span><span class="p">,</span> <span class="s2">&quot;startTime&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;window&quot;</span><span class="p">,</span> <span class="n">time_col</span><span class="p">,</span> <span class="n">windowDuration</span><span class="p">,</span> <span class="n">windowDuration</span><span class="p">,</span> <span class="n">startTime</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;window&quot;</span><span class="p">,</span> <span class="n">time_col</span><span class="p">,</span> <span class="n">windowDuration</span><span class="p">)</span></div>
<div class="viewcode-block" id="window_time"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.window_time.html#pyspark.sql.functions.window_time">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">window_time</span><span class="p">(</span>
<span class="n">windowColumn</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Computes the event time from a window column. The column window values are produced</span>
<span class="sd"> by window aggregating operators and are of type `STRUCT&lt;start: TIMESTAMP, end: TIMESTAMP&gt;`</span>
<span class="sd"> where start is inclusive and end is exclusive. The event time of records produced by window</span>
<span class="sd"> aggregating operators can be computed as ``window_time(window)`` and are</span>
<span class="sd"> ``window.end - lit(1).alias(&quot;microsecond&quot;)`` (as microsecond is the minimal supported event</span>
<span class="sd"> time precision). The window column must be one produced by a window aggregating operator.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> windowColumn : :class:`~pyspark.sql.Column`</span>
<span class="sd"> The window column of a window aggregate records.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import datetime</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)],</span>
<span class="sd"> ... ).toDF(&quot;date&quot;, &quot;val&quot;)</span>
<span class="sd"> Group the data into 5 second time windows and aggregate as sum.</span>
<span class="sd"> &gt;&gt;&gt; w = df.groupBy(window(&quot;date&quot;, &quot;5 seconds&quot;)).agg(sum(&quot;val&quot;).alias(&quot;sum&quot;))</span>
<span class="sd"> Extract the window event time using the window_time function.</span>
<span class="sd"> &gt;&gt;&gt; w.select(</span>
<span class="sd"> ... w.window.end.cast(&quot;string&quot;).alias(&quot;end&quot;),</span>
<span class="sd"> ... window_time(w.window).cast(&quot;string&quot;).alias(&quot;window_time&quot;),</span>
<span class="sd"> ... &quot;sum&quot;</span>
<span class="sd"> ... ).collect()</span>
<span class="sd"> [Row(end=&#39;2016-03-11 09:00:10&#39;, window_time=&#39;2016-03-11 09:00:09.999999&#39;, sum=1)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">window_col</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">windowColumn</span><span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;window_time&quot;</span><span class="p">,</span> <span class="n">window_col</span><span class="p">)</span></div>
<div class="viewcode-block" id="session_window"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.session_window.html#pyspark.sql.functions.session_window">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">session_window</span><span class="p">(</span><span class="n">timeColumn</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">gapDuration</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Generates session window given a timestamp specifying column.</span>
<span class="sd"> Session window is one of dynamic windows, which means the length of window is varying</span>
<span class="sd"> according to the given inputs. The length of session window is defined as &quot;the timestamp</span>
<span class="sd"> of latest input of the session + gap duration&quot;, so when the new inputs are bound to the</span>
<span class="sd"> current session window, the end time of session window can be expanded according to the new</span>
<span class="sd"> inputs.</span>
<span class="sd"> Windows can support microsecond precision. Windows in the order of months are not supported.</span>
<span class="sd"> For a streaming query, you may use the function `current_timestamp` to generate windows on</span>
<span class="sd"> processing time.</span>
<span class="sd"> gapDuration is provided as strings, e.g. &#39;1 second&#39;, &#39;1 day 12 hours&#39;, &#39;2 minutes&#39;. Valid</span>
<span class="sd"> interval strings are &#39;week&#39;, &#39;day&#39;, &#39;hour&#39;, &#39;minute&#39;, &#39;second&#39;, &#39;millisecond&#39;, &#39;microsecond&#39;.</span>
<span class="sd"> It could also be a Column which can be evaluated to gap duration dynamically based on the</span>
<span class="sd"> input row.</span>
<span class="sd"> The output column will be a struct called &#39;session_window&#39; by default with the nested columns</span>
<span class="sd"> &#39;start&#39; and &#39;end&#39;, where &#39;start&#39; and &#39;end&#39; will be of :class:`pyspark.sql.types.TimestampType`.</span>
<span class="sd"> .. versionadded:: 3.2.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> timeColumn : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The column name or column to use as the timestamp for windowing by time.</span>
<span class="sd"> The time column must be of TimestampType or TimestampNTZType.</span>
<span class="sd"> gapDuration : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A Python string literal or column specifying the timeout of the session. It could be</span>
<span class="sd"> static value, e.g. `10 minutes`, `1 second`, or an expression/UDF that specifies gap</span>
<span class="sd"> duration dynamically based on the input row.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;2016-03-11 09:00:07&quot;, 1)]).toDF(&quot;date&quot;, &quot;val&quot;)</span>
<span class="sd"> &gt;&gt;&gt; w = df.groupBy(session_window(&quot;date&quot;, &quot;5 seconds&quot;)).agg(sum(&quot;val&quot;).alias(&quot;sum&quot;))</span>
<span class="sd"> &gt;&gt;&gt; w.select(w.session_window.start.cast(&quot;string&quot;).alias(&quot;start&quot;),</span>
<span class="sd"> ... w.session_window.end.cast(&quot;string&quot;).alias(&quot;end&quot;), &quot;sum&quot;).collect()</span>
<span class="sd"> [Row(start=&#39;2016-03-11 09:00:07&#39;, end=&#39;2016-03-11 09:00:12&#39;, sum=1)]</span>
<span class="sd"> &gt;&gt;&gt; w = df.groupBy(session_window(&quot;date&quot;, lit(&quot;5 seconds&quot;))).agg(sum(&quot;val&quot;).alias(&quot;sum&quot;))</span>
<span class="sd"> &gt;&gt;&gt; w.select(w.session_window.start.cast(&quot;string&quot;).alias(&quot;start&quot;),</span>
<span class="sd"> ... w.session_window.end.cast(&quot;string&quot;).alias(&quot;end&quot;), &quot;sum&quot;).collect()</span>
<span class="sd"> [Row(start=&#39;2016-03-11 09:00:07&#39;, end=&#39;2016-03-11 09:00:12&#39;, sum=1)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">check_field</span><span class="p">(</span><span class="n">field</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> <span class="n">fieldName</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="n">field</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">field</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">)):</span>
<span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span>
<span class="n">error_class</span><span class="o">=</span><span class="s2">&quot;NOT_COLUMN_OR_STR&quot;</span><span class="p">,</span>
<span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;arg_name&quot;</span><span class="p">:</span> <span class="n">fieldName</span><span class="p">,</span> <span class="s2">&quot;arg_type&quot;</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">field</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span>
<span class="p">)</span>
<span class="n">time_col</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">timeColumn</span><span class="p">)</span>
<span class="n">check_field</span><span class="p">(</span><span class="n">gapDuration</span><span class="p">,</span> <span class="s2">&quot;gapDuration&quot;</span><span class="p">)</span>
<span class="n">gap_duration</span> <span class="o">=</span> <span class="n">gapDuration</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">gapDuration</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">else</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">gapDuration</span><span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;session_window&quot;</span><span class="p">,</span> <span class="n">time_col</span><span class="p">,</span> <span class="n">gap_duration</span><span class="p">)</span></div>
<div class="viewcode-block" id="to_unix_timestamp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_unix_timestamp.html#pyspark.sql.functions.to_unix_timestamp">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">to_unix_timestamp</span><span class="p">(</span>
<span class="n">timestamp</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the UNIX timestamp of the given time.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> timestamp : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input column or strings.</span>
<span class="sd"> format : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> format to use to convert UNIX timestamp values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.set(&quot;spark.sql.session.timeZone&quot;, &quot;America/Los_Angeles&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;2016-04-08&quot;,)], [&quot;e&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_unix_timestamp(df.e, lit(&quot;yyyy-MM-dd&quot;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=1460098800)]</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.unset(&quot;spark.sql.session.timeZone&quot;)</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.set(&quot;spark.sql.session.timeZone&quot;, &quot;America/Los_Angeles&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;2016-04-08&quot;,)], [&quot;e&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_unix_timestamp(df.e).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=None)]</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.unset(&quot;spark.sql.session.timeZone&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">format</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;to_unix_timestamp&quot;</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">,</span> <span class="nb">format</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;to_unix_timestamp&quot;</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">)</span></div>
<div class="viewcode-block" id="to_timestamp_ltz"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_timestamp_ltz.html#pyspark.sql.functions.to_timestamp_ltz">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">to_timestamp_ltz</span><span class="p">(</span>
<span class="n">timestamp</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Parses the `timestamp` with the `format` to a timestamp without time zone.</span>
<span class="sd"> Returns null with invalid input.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> timestamp : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input column or strings.</span>
<span class="sd"> format : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> format to use to convert type `TimestampType` timestamp values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;2016-12-31&quot;,)], [&quot;e&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_timestamp_ltz(df.e, lit(&quot;yyyy-MM-dd&quot;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> ... # doctest: +SKIP</span>
<span class="sd"> [Row(r=datetime.datetime(2016, 12, 31, 0, 0))]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;2016-12-31&quot;,)], [&quot;e&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_timestamp_ltz(df.e).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> ... # doctest: +SKIP</span>
<span class="sd"> [Row(r=datetime.datetime(2016, 12, 31, 0, 0))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">format</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;to_timestamp_ltz&quot;</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">,</span> <span class="nb">format</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;to_timestamp_ltz&quot;</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">)</span></div>
<div class="viewcode-block" id="to_timestamp_ntz"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_timestamp_ntz.html#pyspark.sql.functions.to_timestamp_ntz">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">to_timestamp_ntz</span><span class="p">(</span>
<span class="n">timestamp</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Parses the `timestamp` with the `format` to a timestamp without time zone.</span>
<span class="sd"> Returns null with invalid input.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> timestamp : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input column or strings.</span>
<span class="sd"> format : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> format to use to convert type `TimestampNTZType` timestamp values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;2016-04-08&quot;,)], [&quot;e&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_timestamp_ntz(df.e, lit(&quot;yyyy-MM-dd&quot;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> ... # doctest: +SKIP</span>
<span class="sd"> [Row(r=datetime.datetime(2016, 4, 8, 0, 0))]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;2016-04-08&quot;,)], [&quot;e&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_timestamp_ntz(df.e).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> ... # doctest: +SKIP</span>
<span class="sd"> [Row(r=datetime.datetime(2016, 4, 8, 0, 0))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">format</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;to_timestamp_ntz&quot;</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">,</span> <span class="nb">format</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;to_timestamp_ntz&quot;</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">)</span></div>
<span class="c1"># ---------------------------- misc functions ----------------------------------</span>
<div class="viewcode-block" id="current_catalog"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.current_catalog.html#pyspark.sql.functions.current_catalog">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">current_catalog</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the current catalog.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).select(current_catalog()).show()</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> |current_catalog()|</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> | spark_catalog|</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;current_catalog&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="current_database"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.current_database.html#pyspark.sql.functions.current_database">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">current_database</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the current database.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).select(current_database()).show()</span>
<span class="sd"> +------------------+</span>
<span class="sd"> |current_database()|</span>
<span class="sd"> +------------------+</span>
<span class="sd"> | default|</span>
<span class="sd"> +------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;current_database&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="current_schema"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.current_schema.html#pyspark.sql.functions.current_schema">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">current_schema</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the current database.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).select(sf.current_schema()).show()</span>
<span class="sd"> +------------------+</span>
<span class="sd"> |current_database()|</span>
<span class="sd"> +------------------+</span>
<span class="sd"> | default|</span>
<span class="sd"> +------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;current_schema&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="current_user"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.current_user.html#pyspark.sql.functions.current_user">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">current_user</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the current database.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).select(current_user()).show() # doctest: +SKIP</span>
<span class="sd"> +--------------+</span>
<span class="sd"> |current_user()|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> | ruifeng.zheng|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;current_user&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="user"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.user.html#pyspark.sql.functions.user">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">user</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the current database.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).select(sf.user()).show() # doctest: +SKIP</span>
<span class="sd"> +--------------+</span>
<span class="sd"> |current_user()|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> | ruifeng.zheng|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;user&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="crc32"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.crc32.html#pyspark.sql.functions.crc32">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">crc32</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Calculates the cyclic redundancy check value (CRC32) of a binary column and</span>
<span class="sd"> returns the value as a bigint.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;ABC&#39;,)], [&#39;a&#39;]).select(crc32(&#39;a&#39;).alias(&#39;crc32&#39;)).collect()</span>
<span class="sd"> [Row(crc32=2743272264)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;crc32&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="md5"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.md5.html#pyspark.sql.functions.md5">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">md5</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Calculates the MD5 digest and returns the value as a 32 character hex string.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;ABC&#39;,)], [&#39;a&#39;]).select(md5(&#39;a&#39;).alias(&#39;hash&#39;)).collect()</span>
<span class="sd"> [Row(hash=&#39;902fbdd2b1df0c4f70b4a5d23525e932&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;md5&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="sha1"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sha1.html#pyspark.sql.functions.sha1">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">sha1</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the hex string result of SHA-1.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;ABC&#39;,)], [&#39;a&#39;]).select(sha1(&#39;a&#39;).alias(&#39;hash&#39;)).collect()</span>
<span class="sd"> [Row(hash=&#39;3c01bdbb26f358bab27f267924aa2c9a03fcfdb8&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;sha1&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="sha2"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sha2.html#pyspark.sql.functions.sha2">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">sha2</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">numBits</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the hex string result of SHA-2 family of hash functions (SHA-224, SHA-256, SHA-384,</span>
<span class="sd"> and SHA-512). The numBits indicates the desired bit length of the result, which must have a</span>
<span class="sd"> value of 224, 256, 384, 512, or 0 (which is equivalent to 256).</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> numBits : int</span>
<span class="sd"> the desired bit length of the result, which must have a</span>
<span class="sd"> value of 224, 256, 384, 512, or 0 (which is equivalent to 256).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[&quot;Alice&quot;], [&quot;Bob&quot;]], [&quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.withColumn(&quot;sha2&quot;, sha2(df.name, 256)).show(truncate=False)</span>
<span class="sd"> +-----+----------------------------------------------------------------+</span>
<span class="sd"> |name |sha2 |</span>
<span class="sd"> +-----+----------------------------------------------------------------+</span>
<span class="sd"> |Alice|3bc51062973c458d5a6f2d8d64a023246354ad7e064b1e4e009ec8a0699a3043|</span>
<span class="sd"> |Bob |cd9fb1e148ccd8442e5aa74904cc73bf6fb54d1d54d333bd596aa9bb4bb4e961|</span>
<span class="sd"> +-----+----------------------------------------------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;sha2&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">numBits</span><span class="p">)</span></div>
<div class="viewcode-block" id="hash"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.hash.html#pyspark.sql.functions.hash">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">hash</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Calculates the hash code of given columns, and returns the result as an int column.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> one or more columns to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> hash value as int column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;ABC&#39;, &#39;DEF&#39;)], [&#39;c1&#39;, &#39;c2&#39;])</span>
<span class="sd"> Hash for one column</span>
<span class="sd"> &gt;&gt;&gt; df.select(hash(&#39;c1&#39;).alias(&#39;hash&#39;)).show()</span>
<span class="sd"> +----------+</span>
<span class="sd"> | hash|</span>
<span class="sd"> +----------+</span>
<span class="sd"> |-757602832|</span>
<span class="sd"> +----------+</span>
<span class="sd"> Two or more columns</span>
<span class="sd"> &gt;&gt;&gt; df.select(hash(&#39;c1&#39;, &#39;c2&#39;).alias(&#39;hash&#39;)).show()</span>
<span class="sd"> +---------+</span>
<span class="sd"> | hash|</span>
<span class="sd"> +---------+</span>
<span class="sd"> |599895104|</span>
<span class="sd"> +---------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">&quot;hash&quot;</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div>
<div class="viewcode-block" id="xxhash64"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.xxhash64.html#pyspark.sql.functions.xxhash64">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">xxhash64</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Calculates the hash code of given columns using the 64-bit variant of the xxHash algorithm,</span>
<span class="sd"> and returns the result as a long column. The hash computation uses an initial seed of 42.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> one or more columns to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> hash value as long column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;ABC&#39;, &#39;DEF&#39;)], [&#39;c1&#39;, &#39;c2&#39;])</span>
<span class="sd"> Hash for one column</span>
<span class="sd"> &gt;&gt;&gt; df.select(xxhash64(&#39;c1&#39;).alias(&#39;hash&#39;)).show()</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> | hash|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> |4105715581806190027|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> Two or more columns</span>
<span class="sd"> &gt;&gt;&gt; df.select(xxhash64(&#39;c1&#39;, &#39;c2&#39;).alias(&#39;hash&#39;)).show()</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> | hash|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> |3233247871021311208|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">&quot;xxhash64&quot;</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div>
<div class="viewcode-block" id="assert_true"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.assert_true.html#pyspark.sql.functions.assert_true">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">assert_true</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">errMsg</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns `null` if the input column is `true`; throws an exception</span>
<span class="sd"> with the provided error message otherwise.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column name or column that represents the input column to test</span>
<span class="sd"> errMsg : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> A Python string literal or column containing the error message</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> `null` if the input column is `true` otherwise throws an error with specified message.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(0,1)], [&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(assert_true(df.a &lt; df.b).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=None)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(assert_true(df.a &lt; df.b, df.a).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=None)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(assert_true(df.a &lt; df.b, &#39;error&#39;).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=None)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(assert_true(df.a &gt; df.b, &#39;My error msg&#39;).alias(&#39;r&#39;)).collect() # doctest: +SKIP</span>
<span class="sd"> ...</span>
<span class="sd"> java.lang.RuntimeException: My error msg</span>
<span class="sd"> ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">errMsg</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;assert_true&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">errMsg</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">)):</span>
<span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span>
<span class="n">error_class</span><span class="o">=</span><span class="s2">&quot;NOT_COLUMN_OR_STR&quot;</span><span class="p">,</span>
<span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;arg_name&quot;</span><span class="p">:</span> <span class="s2">&quot;errMsg&quot;</span><span class="p">,</span> <span class="s2">&quot;arg_type&quot;</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">errMsg</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span>
<span class="p">)</span>
<span class="n">errMsg</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">errMsg</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">errMsg</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">else</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">errMsg</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;assert_true&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">errMsg</span><span class="p">)</span></div>
<div class="viewcode-block" id="raise_error"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.raise_error.html#pyspark.sql.functions.raise_error">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">raise_error</span><span class="p">(</span><span class="n">errMsg</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Throws an exception with the provided error message.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> errMsg : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A Python string literal or column containing the error message</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> throws an error with specified message.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(raise_error(&quot;My error message&quot;)).show() # doctest: +SKIP</span>
<span class="sd"> ...</span>
<span class="sd"> java.lang.RuntimeException: My error message</span>
<span class="sd"> ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">errMsg</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">)):</span>
<span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span>
<span class="n">error_class</span><span class="o">=</span><span class="s2">&quot;NOT_COLUMN_OR_STR&quot;</span><span class="p">,</span>
<span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;arg_name&quot;</span><span class="p">:</span> <span class="s2">&quot;errMsg&quot;</span><span class="p">,</span> <span class="s2">&quot;arg_type&quot;</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">errMsg</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span>
<span class="p">)</span>
<span class="n">errMsg</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">errMsg</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">errMsg</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">else</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">errMsg</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;raise_error&quot;</span><span class="p">,</span> <span class="n">errMsg</span><span class="p">)</span></div>
<span class="c1"># ---------------------- String/Binary functions ------------------------------</span>
<div class="viewcode-block" id="upper"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.upper.html#pyspark.sql.functions.upper">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">upper</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts a string expression to upper case.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> upper case values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([&quot;Spark&quot;, &quot;PySpark&quot;, &quot;Pandas API&quot;], &quot;STRING&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(upper(&quot;value&quot;)).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |upper(value)|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | SPARK|</span>
<span class="sd"> | PYSPARK|</span>
<span class="sd"> | PANDAS API|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;upper&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="lower"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.lower.html#pyspark.sql.functions.lower">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">lower</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts a string expression to lower case.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> lower case values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([&quot;Spark&quot;, &quot;PySpark&quot;, &quot;Pandas API&quot;], &quot;STRING&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(lower(&quot;value&quot;)).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |lower(value)|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | spark|</span>
<span class="sd"> | pyspark|</span>
<span class="sd"> | pandas api|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;lower&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="ascii"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.ascii.html#pyspark.sql.functions.ascii">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">ascii</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the numeric value of the first character of the string column.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> numeric value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([&quot;Spark&quot;, &quot;PySpark&quot;, &quot;Pandas API&quot;], &quot;STRING&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(ascii(&quot;value&quot;)).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |ascii(value)|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | 83|</span>
<span class="sd"> | 80|</span>
<span class="sd"> | 80|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;ascii&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="base64"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.base64.html#pyspark.sql.functions.base64">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">base64</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the BASE64 encoding of a binary column and returns it as a string column.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> BASE64 encoding of string value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([&quot;Spark&quot;, &quot;PySpark&quot;, &quot;Pandas API&quot;], &quot;STRING&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(base64(&quot;value&quot;)).show()</span>
<span class="sd"> +----------------+</span>
<span class="sd"> | base64(value)|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> | U3Bhcms=|</span>
<span class="sd"> | UHlTcGFyaw==|</span>
<span class="sd"> |UGFuZGFzIEFQSQ==|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;base64&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="unbase64"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.unbase64.html#pyspark.sql.functions.unbase64">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">unbase64</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Decodes a BASE64 encoded string column and returns it as a binary column.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> encoded string value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([&quot;U3Bhcms=&quot;,</span>
<span class="sd"> ... &quot;UHlTcGFyaw==&quot;,</span>
<span class="sd"> ... &quot;UGFuZGFzIEFQSQ==&quot;], &quot;STRING&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(unbase64(&quot;value&quot;)).show()</span>
<span class="sd"> +--------------------+</span>
<span class="sd"> | unbase64(value)|</span>
<span class="sd"> +--------------------+</span>
<span class="sd"> | [53 70 61 72 6B]|</span>
<span class="sd"> |[50 79 53 70 61 7...|</span>
<span class="sd"> |[50 61 6E 64 61 7...|</span>
<span class="sd"> +--------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;unbase64&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="ltrim"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.ltrim.html#pyspark.sql.functions.ltrim">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">ltrim</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Trim the spaces from left end for the specified string value.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> left trimmed values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([&quot; Spark&quot;, &quot;Spark &quot;, &quot; Spark&quot;], &quot;STRING&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(ltrim(&quot;value&quot;).alias(&quot;r&quot;)).withColumn(&quot;length&quot;, length(&quot;r&quot;)).show()</span>
<span class="sd"> +-------+------+</span>
<span class="sd"> | r|length|</span>
<span class="sd"> +-------+------+</span>
<span class="sd"> | Spark| 5|</span>
<span class="sd"> |Spark | 7|</span>
<span class="sd"> | Spark| 5|</span>
<span class="sd"> +-------+------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;ltrim&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="rtrim"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.rtrim.html#pyspark.sql.functions.rtrim">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">rtrim</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Trim the spaces from right end for the specified string value.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> right trimmed values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([&quot; Spark&quot;, &quot;Spark &quot;, &quot; Spark&quot;], &quot;STRING&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(rtrim(&quot;value&quot;).alias(&quot;r&quot;)).withColumn(&quot;length&quot;, length(&quot;r&quot;)).show()</span>
<span class="sd"> +--------+------+</span>
<span class="sd"> | r|length|</span>
<span class="sd"> +--------+------+</span>
<span class="sd"> | Spark| 8|</span>
<span class="sd"> | Spark| 5|</span>
<span class="sd"> | Spark| 6|</span>
<span class="sd"> +--------+------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;rtrim&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="trim"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.trim.html#pyspark.sql.functions.trim">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">trim</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Trim the spaces from both ends for the specified string column.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> trimmed values from both sides.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([&quot; Spark&quot;, &quot;Spark &quot;, &quot; Spark&quot;], &quot;STRING&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(trim(&quot;value&quot;).alias(&quot;r&quot;)).withColumn(&quot;length&quot;, length(&quot;r&quot;)).show()</span>
<span class="sd"> +-----+------+</span>
<span class="sd"> | r|length|</span>
<span class="sd"> +-----+------+</span>
<span class="sd"> |Spark| 5|</span>
<span class="sd"> |Spark| 5|</span>
<span class="sd"> |Spark| 5|</span>
<span class="sd"> +-----+------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;trim&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="concat_ws"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.concat_ws.html#pyspark.sql.functions.concat_ws">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">concat_ws</span><span class="p">(</span><span class="n">sep</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Concatenates multiple input string columns together into a single string column,</span>
<span class="sd"> using the given separator.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> sep : str</span>
<span class="sd"> words separator.</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> list of columns to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> string of concatenated words.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;abcd&#39;,&#39;123&#39;)], [&#39;s&#39;, &#39;d&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(concat_ws(&#39;-&#39;, df.s, df.d).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=&#39;abcd-123&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;concat_ws&quot;</span><span class="p">,</span> <span class="n">sep</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span></div>
<div class="viewcode-block" id="decode"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.decode.html#pyspark.sql.functions.decode">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">decode</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">charset</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the first argument into a string from a binary using the provided character set</span>
<span class="sd"> (one of &#39;US-ASCII&#39;, &#39;ISO-8859-1&#39;, &#39;UTF-8&#39;, &#39;UTF-16BE&#39;, &#39;UTF-16LE&#39;, &#39;UTF-16&#39;).</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> charset : str</span>
<span class="sd"> charset to use to decode to.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;abcd&#39;,)], [&#39;a&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(decode(&quot;a&quot;, &quot;UTF-8&quot;)).show()</span>
<span class="sd"> +----------------+</span>
<span class="sd"> |decode(a, UTF-8)|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> | abcd|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;decode&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">charset</span><span class="p">)</span></div>
<div class="viewcode-block" id="encode"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.encode.html#pyspark.sql.functions.encode">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">encode</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">charset</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the first argument into a binary from a string using the provided character set</span>
<span class="sd"> (one of &#39;US-ASCII&#39;, &#39;ISO-8859-1&#39;, &#39;UTF-8&#39;, &#39;UTF-16BE&#39;, &#39;UTF-16LE&#39;, &#39;UTF-16&#39;).</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> charset : str</span>
<span class="sd"> charset to use to encode.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column for computed results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;abcd&#39;,)], [&#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(encode(&quot;c&quot;, &quot;UTF-8&quot;)).show()</span>
<span class="sd"> +----------------+</span>
<span class="sd"> |encode(c, UTF-8)|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> | [61 62 63 64]|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;encode&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">charset</span><span class="p">)</span></div>
<div class="viewcode-block" id="format_number"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.format_number.html#pyspark.sql.functions.format_number">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">format_number</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">d</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Formats the number X to a format like &#39;#,--#,--#.--&#39;, rounded to d decimal places</span>
<span class="sd"> with HALF_EVEN round mode, and returns the result as a string.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the column name of the numeric value to be formatted</span>
<span class="sd"> d : int</span>
<span class="sd"> the N decimal places</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column of formatted results.</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(5,)], [&#39;a&#39;]).select(format_number(&#39;a&#39;, 4).alias(&#39;v&#39;)).collect()</span>
<span class="sd"> [Row(v=&#39;5.0000&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;format_number&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">d</span><span class="p">)</span></div>
<div class="viewcode-block" id="format_string"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.format_string.html#pyspark.sql.functions.format_string">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">format_string</span><span class="p">(</span><span class="nb">format</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Formats the arguments in printf-style and returns the result as a string column.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> format : str</span>
<span class="sd"> string that can contain embedded format tags and used as result column&#39;s value</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column names or :class:`~pyspark.sql.Column`\\s to be used in formatting</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the column of formatted results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(5, &quot;hello&quot;)], [&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(format_string(&#39;%d %s&#39;, df.a, df.b).alias(&#39;v&#39;)).collect()</span>
<span class="sd"> [Row(v=&#39;5 hello&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;format_string&quot;</span><span class="p">,</span> <span class="nb">format</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span></div>
<div class="viewcode-block" id="instr"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.instr.html#pyspark.sql.functions.instr">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">instr</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">substr</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Locate the position of the first occurrence of substr column in the given string.</span>
<span class="sd"> Returns null if either of the arguments are null.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The position is not zero based, but 1 based index. Returns 0 if substr</span>
<span class="sd"> could not be found in str.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> substr : str</span>
<span class="sd"> substring to look for.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> location of the first occurrence of the substring as integer.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;abcd&#39;,)], [&#39;s&#39;,])</span>
<span class="sd"> &gt;&gt;&gt; df.select(instr(df.s, &#39;b&#39;).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=2)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;instr&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="nb">str</span><span class="p">),</span> <span class="n">substr</span><span class="p">)</span></div>
<div class="viewcode-block" id="overlay"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.overlay.html#pyspark.sql.functions.overlay">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">overlay</span><span class="p">(</span>
<span class="n">src</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">replace</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">pos</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">int</span><span class="p">],</span>
<span class="nb">len</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Overlay the specified portion of `src` with `replace`,</span>
<span class="sd"> starting from byte position `pos` of `src` and proceeding for `len` bytes.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> src : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column name or column containing the string that will be replaced</span>
<span class="sd"> replace : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column name or column containing the substitution string</span>
<span class="sd"> pos : :class:`~pyspark.sql.Column` or str or int</span>
<span class="sd"> column name, column, or int containing the starting position in src</span>
<span class="sd"> len : :class:`~pyspark.sql.Column` or str or int, optional</span>
<span class="sd"> column name, column, or int containing the number of bytes to replace in src</span>
<span class="sd"> string by &#39;replace&#39; defaults to -1, which represents the length of the &#39;replace&#39; string</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> string with replaced values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;SPARK_SQL&quot;, &quot;CORE&quot;)], (&quot;x&quot;, &quot;y&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(overlay(&quot;x&quot;, &quot;y&quot;, 7).alias(&quot;overlayed&quot;)).collect()</span>
<span class="sd"> [Row(overlayed=&#39;SPARK_CORE&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(overlay(&quot;x&quot;, &quot;y&quot;, 7, 0).alias(&quot;overlayed&quot;)).collect()</span>
<span class="sd"> [Row(overlayed=&#39;SPARK_CORESQL&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(overlay(&quot;x&quot;, &quot;y&quot;, 7, 2).alias(&quot;overlayed&quot;)).collect()</span>
<span class="sd"> [Row(overlayed=&#39;SPARK_COREL&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pos</span><span class="p">,</span> <span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">)):</span>
<span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span>
<span class="n">error_class</span><span class="o">=</span><span class="s2">&quot;NOT_COLUMN_OR_INT_OR_STR&quot;</span><span class="p">,</span>
<span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;arg_name&quot;</span><span class="p">:</span> <span class="s2">&quot;pos&quot;</span><span class="p">,</span> <span class="s2">&quot;arg_type&quot;</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">pos</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="nb">len</span><span class="p">,</span> <span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">)):</span>
<span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span>
<span class="n">error_class</span><span class="o">=</span><span class="s2">&quot;NOT_COLUMN_OR_INT_OR_STR&quot;</span><span class="p">,</span>
<span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;arg_name&quot;</span><span class="p">:</span> <span class="s2">&quot;len&quot;</span><span class="p">,</span> <span class="s2">&quot;arg_type&quot;</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="nb">len</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span>
<span class="p">)</span>
<span class="n">pos</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">pos</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pos</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">pos</span><span class="p">)</span>
<span class="nb">len</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="nb">len</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="nb">len</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="nb">len</span><span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;overlay&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">src</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">replace</span><span class="p">),</span> <span class="n">pos</span><span class="p">,</span> <span class="nb">len</span><span class="p">)</span></div>
<div class="viewcode-block" id="sentences"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sentences.html#pyspark.sql.functions.sentences">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">sentences</span><span class="p">(</span>
<span class="n">string</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">language</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">country</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Splits a string into arrays of sentences, where each sentence is an array of words.</span>
<span class="sd"> The &#39;language&#39; and &#39;country&#39; arguments are optional, and if omitted, the default locale is used.</span>
<span class="sd"> .. versionadded:: 3.2.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> string : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a string to be split</span>
<span class="sd"> language : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> a language of the locale</span>
<span class="sd"> country : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> a country of the locale</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> arrays of split sentences.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[&quot;This is an example sentence.&quot;]], [&quot;string&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(sentences(df.string, lit(&quot;en&quot;), lit(&quot;US&quot;))).show(truncate=False)</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> |sentences(string, en, US) |</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> |[[This, is, an, example, sentence]]|</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[&quot;Hello world. How are you?&quot;]], [&quot;s&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(sentences(&quot;s&quot;)).show(truncate=False)</span>
<span class="sd"> +---------------------------------+</span>
<span class="sd"> |sentences(s, , ) |</span>
<span class="sd"> +---------------------------------+</span>
<span class="sd"> |[[Hello, world], [How, are, you]]|</span>
<span class="sd"> +---------------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">language</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">language</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">&quot;&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">country</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">country</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">&quot;&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;sentences&quot;</span><span class="p">,</span> <span class="n">string</span><span class="p">,</span> <span class="n">language</span><span class="p">,</span> <span class="n">country</span><span class="p">)</span></div>
<div class="viewcode-block" id="substring"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.substring.html#pyspark.sql.functions.substring">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">substring</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">pos</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">len</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Substring starts at `pos` and is of length `len` when str is String type or</span>
<span class="sd"> returns the slice of byte array that starts at `pos` in byte and is of length `len`</span>
<span class="sd"> when str is Binary type.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The position is not zero based, but 1 based index.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> pos : int</span>
<span class="sd"> starting position in str.</span>
<span class="sd"> len : int</span>
<span class="sd"> length of chars.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> substring of given value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;abcd&#39;,)], [&#39;s&#39;,])</span>
<span class="sd"> &gt;&gt;&gt; df.select(substring(df.s, 1, 2).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=&#39;ab&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;substring&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="nb">str</span><span class="p">),</span> <span class="n">pos</span><span class="p">,</span> <span class="nb">len</span><span class="p">)</span></div>
<div class="viewcode-block" id="substring_index"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.substring_index.html#pyspark.sql.functions.substring_index">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">substring_index</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">delim</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">count</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the substring from string str before count occurrences of the delimiter delim.</span>
<span class="sd"> If count is positive, everything the left of the final delimiter (counting from left) is</span>
<span class="sd"> returned. If count is negative, every to the right of the final delimiter (counting from the</span>
<span class="sd"> right) is returned. substring_index performs a case-sensitive match when searching for delim.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> delim : str</span>
<span class="sd"> delimiter of values.</span>
<span class="sd"> count : int</span>
<span class="sd"> number of occurrences.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> substring of given value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;a.b.c.d&#39;,)], [&#39;s&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(substring_index(df.s, &#39;.&#39;, 2).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=&#39;a.b&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(substring_index(df.s, &#39;.&#39;, -3).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=&#39;b.c.d&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;substring_index&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="nb">str</span><span class="p">),</span> <span class="n">delim</span><span class="p">,</span> <span class="n">count</span><span class="p">)</span></div>
<div class="viewcode-block" id="levenshtein"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.levenshtein.html#pyspark.sql.functions.levenshtein">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">levenshtein</span><span class="p">(</span>
<span class="n">left</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">right</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">threshold</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Computes the Levenshtein distance of the two given strings.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> left : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> first column value.</span>
<span class="sd"> right : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> second column value.</span>
<span class="sd"> threshold : int, optional</span>
<span class="sd"> if set when the levenshtein distance of the two given strings</span>
<span class="sd"> less than or equal to a given threshold then return result distance, or -1</span>
<span class="sd"> .. versionchanged: 3.5.0</span>
<span class="sd"> Added ``threshold`` argument.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> Levenshtein distance as integer value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df0 = spark.createDataFrame([(&#39;kitten&#39;, &#39;sitting&#39;,)], [&#39;l&#39;, &#39;r&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df0.select(levenshtein(&#39;l&#39;, &#39;r&#39;).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=3)]</span>
<span class="sd"> &gt;&gt;&gt; df0.select(levenshtein(&#39;l&#39;, &#39;r&#39;, 2).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=-1)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">threshold</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;levenshtein&quot;</span><span class="p">,</span> <span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span>
<span class="s2">&quot;levenshtein&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">left</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">right</span><span class="p">),</span> <span class="n">threshold</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="locate"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.locate.html#pyspark.sql.functions.locate">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">locate</span><span class="p">(</span><span class="n">substr</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">pos</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Locate the position of the first occurrence of substr in a string column, after position pos.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> substr : str</span>
<span class="sd"> a string</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a Column of :class:`pyspark.sql.types.StringType`</span>
<span class="sd"> pos : int, optional</span>
<span class="sd"> start position (zero based)</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> position of the substring.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The position is not zero based, but 1 based index. Returns 0 if substr</span>
<span class="sd"> could not be found in str.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;abcd&#39;,)], [&#39;s&#39;,])</span>
<span class="sd"> &gt;&gt;&gt; df.select(locate(&#39;b&#39;, df.s, 1).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=2)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;locate&quot;</span><span class="p">,</span> <span class="n">substr</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="nb">str</span><span class="p">),</span> <span class="n">pos</span><span class="p">)</span></div>
<div class="viewcode-block" id="lpad"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.lpad.html#pyspark.sql.functions.lpad">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">lpad</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">len</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">pad</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Left-pad the string column to width `len` with `pad`.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> len : int</span>
<span class="sd"> length of the final string.</span>
<span class="sd"> pad : str</span>
<span class="sd"> chars to prepend.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> left padded result.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;abcd&#39;,)], [&#39;s&#39;,])</span>
<span class="sd"> &gt;&gt;&gt; df.select(lpad(df.s, 6, &#39;#&#39;).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=&#39;##abcd&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;lpad&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="nb">len</span><span class="p">,</span> <span class="n">pad</span><span class="p">)</span></div>
<div class="viewcode-block" id="rpad"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.rpad.html#pyspark.sql.functions.rpad">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">rpad</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">len</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">pad</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Right-pad the string column to width `len` with `pad`.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> len : int</span>
<span class="sd"> length of the final string.</span>
<span class="sd"> pad : str</span>
<span class="sd"> chars to append.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> right padded result.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;abcd&#39;,)], [&#39;s&#39;,])</span>
<span class="sd"> &gt;&gt;&gt; df.select(rpad(df.s, 6, &#39;#&#39;).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=&#39;abcd##&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;rpad&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="nb">len</span><span class="p">,</span> <span class="n">pad</span><span class="p">)</span></div>
<div class="viewcode-block" id="repeat"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.repeat.html#pyspark.sql.functions.repeat">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">repeat</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Repeats a string column n times, and returns it as a new string column.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> n : int</span>
<span class="sd"> number of times to repeat value.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> string with repeated values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;ab&#39;,)], [&#39;s&#39;,])</span>
<span class="sd"> &gt;&gt;&gt; df.select(repeat(df.s, 3).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=&#39;ababab&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;repeat&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">n</span><span class="p">)</span></div>
<div class="viewcode-block" id="split"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.split.html#pyspark.sql.functions.split">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">split</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">pattern</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">limit</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Splits str around matches of the given pattern.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a string expression to split</span>
<span class="sd"> pattern : str</span>
<span class="sd"> a string representing a regular expression. The regex string should be</span>
<span class="sd"> a Java regular expression.</span>
<span class="sd"> limit : int, optional</span>
<span class="sd"> an integer which controls the number of times `pattern` is applied.</span>
<span class="sd"> * ``limit &gt; 0``: The resulting array&#39;s length will not be more than `limit`, and the</span>
<span class="sd"> resulting array&#39;s last entry will contain all input beyond the last</span>
<span class="sd"> matched pattern.</span>
<span class="sd"> * ``limit &lt;= 0``: `pattern` will be applied as many times as possible, and the resulting</span>
<span class="sd"> array can be of any size.</span>
<span class="sd"> .. versionchanged:: 3.0</span>
<span class="sd"> `split` now takes an optional `limit` field. If not provided, default limit value is -1.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> array of separated strings.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;oneAtwoBthreeC&#39;,)], [&#39;s&#39;,])</span>
<span class="sd"> &gt;&gt;&gt; df.select(split(df.s, &#39;[ABC]&#39;, 2).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=[&#39;one&#39;, &#39;twoBthreeC&#39;])]</span>
<span class="sd"> &gt;&gt;&gt; df.select(split(df.s, &#39;[ABC]&#39;, -1).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=[&#39;one&#39;, &#39;two&#39;, &#39;three&#39;, &#39;&#39;])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;split&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="nb">str</span><span class="p">),</span> <span class="n">pattern</span><span class="p">,</span> <span class="n">limit</span><span class="p">)</span></div>
<div class="viewcode-block" id="rlike"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.rlike.html#pyspark.sql.functions.rlike">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">rlike</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">regexp</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;Returns true if `str` matches the Java regex `regexp`, or false otherwise.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> regexp : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> regex pattern to apply.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> true if `str` matches a Java regex, or false otherwise.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;1a 2b 14m&quot;, r&quot;(\d+)&quot;)], [&quot;str&quot;, &quot;regexp&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(rlike(&#39;str&#39;, lit(r&#39;(\d+)&#39;)).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=True)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(rlike(&#39;str&#39;, lit(r&#39;\d{2}b&#39;)).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=False)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(rlike(&quot;str&quot;, col(&quot;regexp&quot;)).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=True)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;rlike&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">regexp</span><span class="p">)</span></div>
<div class="viewcode-block" id="regexp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regexp.html#pyspark.sql.functions.regexp">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">regexp</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">regexp</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;Returns true if `str` matches the Java regex `regexp`, or false otherwise.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> regexp : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> regex pattern to apply.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> true if `str` matches a Java regex, or false otherwise.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(&quot;1a 2b 14m&quot;, r&quot;(\d+)&quot;)], [&quot;str&quot;, &quot;regexp&quot;]</span>
<span class="sd"> ... ).select(sf.regexp(&#39;str&#39;, sf.lit(r&#39;(\d+)&#39;))).show()</span>
<span class="sd"> +------------------+</span>
<span class="sd"> |REGEXP(str, (\d+))|</span>
<span class="sd"> +------------------+</span>
<span class="sd"> | true|</span>
<span class="sd"> +------------------+</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(&quot;1a 2b 14m&quot;, r&quot;(\d+)&quot;)], [&quot;str&quot;, &quot;regexp&quot;]</span>
<span class="sd"> ... ).select(sf.regexp(&#39;str&#39;, sf.lit(r&#39;\d{2}b&#39;))).show()</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> |REGEXP(str, \d{2}b)|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> | false|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(&quot;1a 2b 14m&quot;, r&quot;(\d+)&quot;)], [&quot;str&quot;, &quot;regexp&quot;]</span>
<span class="sd"> ... ).select(sf.regexp(&#39;str&#39;, sf.col(&quot;regexp&quot;))).show()</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> |REGEXP(str, regexp)|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> | true|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;regexp&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">regexp</span><span class="p">)</span></div>
<div class="viewcode-block" id="regexp_like"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regexp_like.html#pyspark.sql.functions.regexp_like">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">regexp_like</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">regexp</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;Returns true if `str` matches the Java regex `regexp`, or false otherwise.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> regexp : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> regex pattern to apply.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> true if `str` matches a Java regex, or false otherwise.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(&quot;1a 2b 14m&quot;, r&quot;(\d+)&quot;)], [&quot;str&quot;, &quot;regexp&quot;]</span>
<span class="sd"> ... ).select(sf.regexp_like(&#39;str&#39;, sf.lit(r&#39;(\d+)&#39;))).show()</span>
<span class="sd"> +-----------------------+</span>
<span class="sd"> |REGEXP_LIKE(str, (\d+))|</span>
<span class="sd"> +-----------------------+</span>
<span class="sd"> | true|</span>
<span class="sd"> +-----------------------+</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(&quot;1a 2b 14m&quot;, r&quot;(\d+)&quot;)], [&quot;str&quot;, &quot;regexp&quot;]</span>
<span class="sd"> ... ).select(sf.regexp_like(&#39;str&#39;, sf.lit(r&#39;\d{2}b&#39;))).show()</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> |REGEXP_LIKE(str, \d{2}b)|</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> | false|</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(&quot;1a 2b 14m&quot;, r&quot;(\d+)&quot;)], [&quot;str&quot;, &quot;regexp&quot;]</span>
<span class="sd"> ... ).select(sf.regexp_like(&#39;str&#39;, sf.col(&quot;regexp&quot;))).show()</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> |REGEXP_LIKE(str, regexp)|</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> | true|</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;regexp_like&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">regexp</span><span class="p">)</span></div>
<div class="viewcode-block" id="regexp_count"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regexp_count.html#pyspark.sql.functions.regexp_count">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">regexp_count</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">regexp</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;Returns a count of the number of times that the Java regex pattern `regexp` is matched</span>
<span class="sd"> in the string `str`.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> regexp : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> regex pattern to apply.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the number of times that a Java regex pattern is matched in the string.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;1a 2b 14m&quot;, r&quot;\d+&quot;)], [&quot;str&quot;, &quot;regexp&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_count(&#39;str&#39;, lit(r&#39;\d+&#39;)).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=3)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_count(&#39;str&#39;, lit(r&#39;mmm&#39;)).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=0)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_count(&quot;str&quot;, col(&quot;regexp&quot;)).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=3)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;regexp_count&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">regexp</span><span class="p">)</span></div>
<div class="viewcode-block" id="regexp_extract"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regexp_extract.html#pyspark.sql.functions.regexp_extract">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">regexp_extract</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">pattern</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">idx</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;Extract a specific group matched by the Java regex `regexp`, from the specified string column.</span>
<span class="sd"> If the regex did not match, or the specified group did not match, an empty string is returned.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> pattern : str</span>
<span class="sd"> regex pattern to apply.</span>
<span class="sd"> idx : int</span>
<span class="sd"> matched group id.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> matched value specified by `idx` group id.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;100-200&#39;,)], [&#39;str&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_extract(&#39;str&#39;, r&#39;(\d+)-(\d+)&#39;, 1).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=&#39;100&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;foo&#39;,)], [&#39;str&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_extract(&#39;str&#39;, r&#39;(\d+)&#39;, 1).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=&#39;&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;aaaac&#39;,)], [&#39;str&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_extract(&#39;str&#39;, &#39;(a+)(b)?(c)&#39;, 2).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=&#39;&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;regexp_extract&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="nb">str</span><span class="p">),</span> <span class="n">pattern</span><span class="p">,</span> <span class="n">idx</span><span class="p">)</span></div>
<div class="viewcode-block" id="regexp_extract_all"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regexp_extract_all.html#pyspark.sql.functions.regexp_extract_all">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">regexp_extract_all</span><span class="p">(</span>
<span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">regexp</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">idx</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Column</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;Extract all strings in the `str` that match the Java regex `regexp`</span>
<span class="sd"> and corresponding to the regex group index.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> regexp : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> regex pattern to apply.</span>
<span class="sd"> idx : int</span>
<span class="sd"> matched group id.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> all strings in the `str` that match a Java regex and corresponding to the regex group index.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;100-200, 300-400&quot;, r&quot;(\d+)-(\d+)&quot;)], [&quot;str&quot;, &quot;regexp&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_extract_all(&#39;str&#39;, lit(r&#39;(\d+)-(\d+)&#39;)).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=[&#39;100&#39;, &#39;300&#39;])]</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_extract_all(&#39;str&#39;, lit(r&#39;(\d+)-(\d+)&#39;), 1).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=[&#39;100&#39;, &#39;300&#39;])]</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_extract_all(&#39;str&#39;, lit(r&#39;(\d+)-(\d+)&#39;), 2).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=[&#39;200&#39;, &#39;400&#39;])]</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_extract_all(&#39;str&#39;, col(&quot;regexp&quot;)).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=[&#39;100&#39;, &#39;300&#39;])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">idx</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;regexp_extract_all&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">regexp</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">idx</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">idx</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">idx</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">idx</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;regexp_extract_all&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">regexp</span><span class="p">,</span> <span class="n">idx</span><span class="p">)</span></div>
<div class="viewcode-block" id="regexp_replace"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regexp_replace.html#pyspark.sql.functions.regexp_replace">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">regexp_replace</span><span class="p">(</span>
<span class="n">string</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">pattern</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">replacement</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">]</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;Replace all substrings of the specified string value that match regexp with replacement.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> string : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column name or column containing the string value</span>
<span class="sd"> pattern : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column object or str containing the regexp pattern</span>
<span class="sd"> replacement : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column object or str containing the replacement</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> string with all substrings replaced.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;100-200&quot;, r&quot;(\d+)&quot;, &quot;--&quot;)], [&quot;str&quot;, &quot;pattern&quot;, &quot;replacement&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_replace(&#39;str&#39;, r&#39;(\d+)&#39;, &#39;--&#39;).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=&#39;-----&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_replace(&quot;str&quot;, col(&quot;pattern&quot;), col(&quot;replacement&quot;)).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=&#39;-----&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pattern</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">pattern_col</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">pattern</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">pattern_col</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">pattern</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">replacement</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">replacement_col</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">replacement</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">replacement_col</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">replacement</span><span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;regexp_replace&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">string</span><span class="p">),</span> <span class="n">pattern_col</span><span class="p">,</span> <span class="n">replacement_col</span><span class="p">)</span></div>
<div class="viewcode-block" id="regexp_substr"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regexp_substr.html#pyspark.sql.functions.regexp_substr">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">regexp_substr</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">regexp</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;Returns the substring that matches the Java regex `regexp` within the string `str`.</span>
<span class="sd"> If the regular expression is not found, the result is null.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> regexp : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> regex pattern to apply.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the substring that matches a Java regex within the string `str`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;1a 2b 14m&quot;, r&quot;\d+&quot;)], [&quot;str&quot;, &quot;regexp&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_substr(&#39;str&#39;, lit(r&#39;\d+&#39;)).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=&#39;1&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_substr(&#39;str&#39;, lit(r&#39;mmm&#39;)).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=None)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_substr(&quot;str&quot;, col(&quot;regexp&quot;)).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=&#39;1&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;regexp_substr&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">regexp</span><span class="p">)</span></div>
<div class="viewcode-block" id="regexp_instr"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regexp_instr.html#pyspark.sql.functions.regexp_instr">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">regexp_instr</span><span class="p">(</span>
<span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">regexp</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">idx</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Column</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;Extract all strings in the `str` that match the Java regex `regexp`</span>
<span class="sd"> and corresponding to the regex group index.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> regexp : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> regex pattern to apply.</span>
<span class="sd"> idx : int</span>
<span class="sd"> matched group id.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> all strings in the `str` that match a Java regex and corresponding to the regex group index.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;1a 2b 14m&quot;, r&quot;\d+(a|b|m)&quot;)], [&quot;str&quot;, &quot;regexp&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_instr(&#39;str&#39;, lit(r&#39;\d+(a|b|m)&#39;)).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=1)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_instr(&#39;str&#39;, lit(r&#39;\d+(a|b|m)&#39;), 1).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=1)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_instr(&#39;str&#39;, lit(r&#39;\d+(a|b|m)&#39;), 2).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=1)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_instr(&#39;str&#39;, col(&quot;regexp&quot;)).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=1)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">idx</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;regexp_instr&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">regexp</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">idx</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">idx</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">idx</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">idx</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;regexp_instr&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">regexp</span><span class="p">,</span> <span class="n">idx</span><span class="p">)</span></div>
<div class="viewcode-block" id="initcap"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.initcap.html#pyspark.sql.functions.initcap">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">initcap</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Translate the first letter of each word to upper case in the sentence.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> string with all first letters are uppercase in each word.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;ab cd&#39;,)], [&#39;a&#39;]).select(initcap(&quot;a&quot;).alias(&#39;v&#39;)).collect()</span>
<span class="sd"> [Row(v=&#39;Ab Cd&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;initcap&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="soundex"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.soundex.html#pyspark.sql.functions.soundex">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">soundex</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the SoundEx encoding for a string</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> SoundEx encoded string.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;Peters&quot;,),(&quot;Uhrbach&quot;,)], [&#39;name&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(soundex(df.name).alias(&quot;soundex&quot;)).collect()</span>
<span class="sd"> [Row(soundex=&#39;P362&#39;), Row(soundex=&#39;U612&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;soundex&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="bin"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bin.html#pyspark.sql.functions.bin">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">bin</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the string representation of the binary value of the given column.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> binary representation of given value as string.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([2,5], &quot;INT&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(bin(df.value).alias(&#39;c&#39;)).collect()</span>
<span class="sd"> [Row(c=&#39;10&#39;), Row(c=&#39;101&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;bin&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="hex"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.hex.html#pyspark.sql.functions.hex">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">hex</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Computes hex value of the given column, which could be :class:`pyspark.sql.types.StringType`,</span>
<span class="sd"> :class:`pyspark.sql.types.BinaryType`, :class:`pyspark.sql.types.IntegerType` or</span>
<span class="sd"> :class:`pyspark.sql.types.LongType`.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> hexadecimal representation of given value as string.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;ABC&#39;, 3)], [&#39;a&#39;, &#39;b&#39;]).select(hex(&#39;a&#39;), hex(&#39;b&#39;)).collect()</span>
<span class="sd"> [Row(hex(a)=&#39;414243&#39;, hex(b)=&#39;3&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;hex&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="unhex"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.unhex.html#pyspark.sql.functions.unhex">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">unhex</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Inverse of hex. Interprets each pair of characters as a hexadecimal number</span>
<span class="sd"> and converts to the byte representation of number.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> string representation of given hexadecimal value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;414243&#39;,)], [&#39;a&#39;]).select(unhex(&#39;a&#39;)).collect()</span>
<span class="sd"> [Row(unhex(a)=bytearray(b&#39;ABC&#39;))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;unhex&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="length"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.length.html#pyspark.sql.functions.length">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">length</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Computes the character length of string data or number of bytes of binary data.</span>
<span class="sd"> The length of character data includes the trailing spaces. The length of binary data</span>
<span class="sd"> includes binary zeros.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> length of the value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;ABC &#39;,)], [&#39;a&#39;]).select(length(&#39;a&#39;).alias(&#39;length&#39;)).collect()</span>
<span class="sd"> [Row(length=4)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;length&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="octet_length"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.octet_length.html#pyspark.sql.functions.octet_length">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">octet_length</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Calculates the byte length for the specified string column.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Source column or strings</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> Byte length of the col</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import octet_length</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;cat&#39;,), ( &#39;\U0001F408&#39;,)], [&#39;cat&#39;]) \\</span>
<span class="sd"> ... .select(octet_length(&#39;cat&#39;)).collect()</span>
<span class="sd"> [Row(octet_length(cat)=3), Row(octet_length(cat)=4)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;octet_length&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="bit_length"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bit_length.html#pyspark.sql.functions.bit_length">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">bit_length</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Calculates the bit length for the specified string column.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Source column or strings</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> Bit length of the col</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import bit_length</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;cat&#39;,), ( &#39;\U0001F408&#39;,)], [&#39;cat&#39;]) \\</span>
<span class="sd"> ... .select(bit_length(&#39;cat&#39;)).collect()</span>
<span class="sd"> [Row(bit_length(cat)=24), Row(bit_length(cat)=32)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;bit_length&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="translate"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.translate.html#pyspark.sql.functions.translate">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">translate</span><span class="p">(</span><span class="n">srcCol</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">matching</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">replace</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;A function translate any character in the `srcCol` by a character in `matching`.</span>
<span class="sd"> The characters in `replace` is corresponding to the characters in `matching`.</span>
<span class="sd"> Translation will happen whenever any character in the string is matching with the character</span>
<span class="sd"> in the `matching`.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> srcCol : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Source column or strings</span>
<span class="sd"> matching : str</span>
<span class="sd"> matching characters.</span>
<span class="sd"> replace : str</span>
<span class="sd"> characters for replacement. If this is shorter than `matching` string then</span>
<span class="sd"> those chars that don&#39;t have replacement will be dropped.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> replaced value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;translate&#39;,)], [&#39;a&#39;]).select(translate(&#39;a&#39;, &quot;rnlt&quot;, &quot;123&quot;) \\</span>
<span class="sd"> ... .alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;1a2s3ae&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;translate&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">srcCol</span><span class="p">),</span> <span class="n">matching</span><span class="p">,</span> <span class="n">replace</span><span class="p">)</span></div>
<div class="viewcode-block" id="to_binary"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_binary.html#pyspark.sql.functions.to_binary">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">to_binary</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts the input `col` to a binary value based on the supplied `format`.</span>
<span class="sd"> The `format` can be a case-insensitive string literal of &quot;hex&quot;, &quot;utf-8&quot;, &quot;utf8&quot;,</span>
<span class="sd"> or &quot;base64&quot;. By default, the binary format for conversion is &quot;hex&quot; if</span>
<span class="sd"> `format` is omitted. The function returns NULL if at least one of the</span>
<span class="sd"> input parameters is NULL.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input column or strings.</span>
<span class="sd"> format : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> format to use to convert binary values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;abc&quot;,)], [&quot;e&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_binary(df.e, lit(&quot;utf-8&quot;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=bytearray(b&#39;abc&#39;))]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;414243&quot;,)], [&quot;e&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_binary(df.e).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=bytearray(b&#39;ABC&#39;))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">format</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;to_binary&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="nb">format</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;to_binary&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="to_char"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_char.html#pyspark.sql.functions.to_char">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">to_char</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Convert `col` to a string based on the `format`.</span>
<span class="sd"> Throws an exception if the conversion fails. The format can consist of the following</span>
<span class="sd"> characters, case insensitive:</span>
<span class="sd"> &#39;0&#39; or &#39;9&#39;: Specifies an expected digit between 0 and 9. A sequence of 0 or 9 in the</span>
<span class="sd"> format string matches a sequence of digits in the input value, generating a result</span>
<span class="sd"> string of the same length as the corresponding sequence in the format string.</span>
<span class="sd"> The result string is left-padded with zeros if the 0/9 sequence comprises more digits</span>
<span class="sd"> than the matching part of the decimal value, starts with 0, and is before the decimal</span>
<span class="sd"> point. Otherwise, it is padded with spaces.</span>
<span class="sd"> &#39;.&#39; or &#39;D&#39;: Specifies the position of the decimal point (optional, only allowed once).</span>
<span class="sd"> &#39;,&#39; or &#39;G&#39;: Specifies the position of the grouping (thousands) separator (,).</span>
<span class="sd"> There must be a 0 or 9 to the left and right of each grouping separator.</span>
<span class="sd"> &#39;$&#39;: Specifies the location of the $ currency sign. This character may only be specified once.</span>
<span class="sd"> &#39;S&#39; or &#39;MI&#39;: Specifies the position of a &#39;-&#39; or &#39;+&#39; sign (optional, only allowed once at</span>
<span class="sd"> the beginning or end of the format string). Note that &#39;S&#39; prints &#39;+&#39; for positive</span>
<span class="sd"> values but &#39;MI&#39; prints a space.</span>
<span class="sd"> &#39;PR&#39;: Only allowed at the end of the format string; specifies that the result string</span>
<span class="sd"> will be wrapped by angle brackets if the input value is negative.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input column or strings.</span>
<span class="sd"> format : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> format to use to convert char values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(78.12,)], [&quot;e&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_char(df.e, lit(&quot;$99.99&quot;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;$78.12&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;to_char&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="nb">format</span><span class="p">)</span></div>
<div class="viewcode-block" id="to_varchar"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_varchar.html#pyspark.sql.functions.to_varchar">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">to_varchar</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Convert `col` to a string based on the `format`.</span>
<span class="sd"> Throws an exception if the conversion fails. The format can consist of the following</span>
<span class="sd"> characters, case insensitive:</span>
<span class="sd"> &#39;0&#39; or &#39;9&#39;: Specifies an expected digit between 0 and 9. A sequence of 0 or 9 in the</span>
<span class="sd"> format string matches a sequence of digits in the input value, generating a result</span>
<span class="sd"> string of the same length as the corresponding sequence in the format string.</span>
<span class="sd"> The result string is left-padded with zeros if the 0/9 sequence comprises more digits</span>
<span class="sd"> than the matching part of the decimal value, starts with 0, and is before the decimal</span>
<span class="sd"> point. Otherwise, it is padded with spaces.</span>
<span class="sd"> &#39;.&#39; or &#39;D&#39;: Specifies the position of the decimal point (optional, only allowed once).</span>
<span class="sd"> &#39;,&#39; or &#39;G&#39;: Specifies the position of the grouping (thousands) separator (,).</span>
<span class="sd"> There must be a 0 or 9 to the left and right of each grouping separator.</span>
<span class="sd"> &#39;$&#39;: Specifies the location of the $ currency sign. This character may only be specified once.</span>
<span class="sd"> &#39;S&#39; or &#39;MI&#39;: Specifies the position of a &#39;-&#39; or &#39;+&#39; sign (optional, only allowed once at</span>
<span class="sd"> the beginning or end of the format string). Note that &#39;S&#39; prints &#39;+&#39; for positive</span>
<span class="sd"> values but &#39;MI&#39; prints a space.</span>
<span class="sd"> &#39;PR&#39;: Only allowed at the end of the format string; specifies that the result string</span>
<span class="sd"> will be wrapped by angle brackets if the input value is negative.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input column or strings.</span>
<span class="sd"> format : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> format to use to convert char values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(78.12,)], [&quot;e&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_varchar(df.e, lit(&quot;$99.99&quot;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;$78.12&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;to_varchar&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="nb">format</span><span class="p">)</span></div>
<div class="viewcode-block" id="to_number"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_number.html#pyspark.sql.functions.to_number">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">to_number</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Convert string &#39;col&#39; to a number based on the string format &#39;format&#39;.</span>
<span class="sd"> Throws an exception if the conversion fails. The format can consist of the following</span>
<span class="sd"> characters, case insensitive:</span>
<span class="sd"> &#39;0&#39; or &#39;9&#39;: Specifies an expected digit between 0 and 9. A sequence of 0 or 9 in the</span>
<span class="sd"> format string matches a sequence of digits in the input string. If the 0/9</span>
<span class="sd"> sequence starts with 0 and is before the decimal point, it can only match a digit</span>
<span class="sd"> sequence of the same size. Otherwise, if the sequence starts with 9 or is after</span>
<span class="sd"> the decimal point, it can match a digit sequence that has the same or smaller size.</span>
<span class="sd"> &#39;.&#39; or &#39;D&#39;: Specifies the position of the decimal point (optional, only allowed once).</span>
<span class="sd"> &#39;,&#39; or &#39;G&#39;: Specifies the position of the grouping (thousands) separator (,).</span>
<span class="sd"> There must be a 0 or 9 to the left and right of each grouping separator.</span>
<span class="sd"> &#39;col&#39; must match the grouping separator relevant for the size of the number.</span>
<span class="sd"> &#39;$&#39;: Specifies the location of the $ currency sign. This character may only be</span>
<span class="sd"> specified once.</span>
<span class="sd"> &#39;S&#39; or &#39;MI&#39;: Specifies the position of a &#39;-&#39; or &#39;+&#39; sign (optional, only allowed</span>
<span class="sd"> once at the beginning or end of the format string). Note that &#39;S&#39; allows &#39;-&#39;</span>
<span class="sd"> but &#39;MI&#39; does not.</span>
<span class="sd"> &#39;PR&#39;: Only allowed at the end of the format string; specifies that &#39;col&#39; indicates a</span>
<span class="sd"> negative number with wrapping angled brackets.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input column or strings.</span>
<span class="sd"> format : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> format to use to convert number values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;$78.12&quot;,)], [&quot;e&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_number(df.e, lit(&quot;$99.99&quot;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=Decimal(&#39;78.12&#39;))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;to_number&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="nb">format</span><span class="p">)</span></div>
<div class="viewcode-block" id="replace"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.replace.html#pyspark.sql.functions.replace">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">replace</span><span class="p">(</span>
<span class="n">src</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">search</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">replace</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Replaces all occurrences of `search` with `replace`.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> src : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A column of string to be replaced.</span>
<span class="sd"> search : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A column of string, If `search` is not found in `str`, `str` is returned unchanged.</span>
<span class="sd"> replace : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> A column of string, If `replace` is not specified or is an empty string,</span>
<span class="sd"> nothing replaces the string that is removed from `str`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;ABCabc&quot;, &quot;abc&quot;, &quot;DEF&quot;,)], [&quot;a&quot;, &quot;b&quot;, &quot;c&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(replace(df.a, df.b, df.c).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;ABCDEF&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(replace(df.a, df.b).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;ABC&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">replace</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;replace&quot;</span><span class="p">,</span> <span class="n">src</span><span class="p">,</span> <span class="n">search</span><span class="p">,</span> <span class="n">replace</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;replace&quot;</span><span class="p">,</span> <span class="n">src</span><span class="p">,</span> <span class="n">search</span><span class="p">)</span></div>
<div class="viewcode-block" id="split_part"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.split_part.html#pyspark.sql.functions.split_part">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">split_part</span><span class="p">(</span><span class="n">src</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">delimiter</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">partNum</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Splits `str` by delimiter and return requested part of the split (1-based).</span>
<span class="sd"> If any input is null, returns null. if `partNum` is out of range of split parts,</span>
<span class="sd"> returns empty string. If `partNum` is 0, throws an error. If `partNum` is negative,</span>
<span class="sd"> the parts are counted backward from the end of the string.</span>
<span class="sd"> If the `delimiter` is an empty string, the `str` is not split.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> src : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A column of string to be splited.</span>
<span class="sd"> delimiter : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A column of string, the delimiter used for split.</span>
<span class="sd"> partNum : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A column of string, requested part of the split (1-based).</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;11.12.13&quot;, &quot;.&quot;, 3,)], [&quot;a&quot;, &quot;b&quot;, &quot;c&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(split_part(df.a, df.b, df.c).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;13&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;split_part&quot;</span><span class="p">,</span> <span class="n">src</span><span class="p">,</span> <span class="n">delimiter</span><span class="p">,</span> <span class="n">partNum</span><span class="p">)</span></div>
<div class="viewcode-block" id="substr"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.substr.html#pyspark.sql.functions.substr">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">substr</span><span class="p">(</span>
<span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">pos</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">len</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the substring of `str` that starts at `pos` and is of length `len`,</span>
<span class="sd"> or the slice of byte array that starts at `pos` and is of length `len`.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> src : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A column of string.</span>
<span class="sd"> pos : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A column of string, the substring of `str` that starts at `pos`.</span>
<span class="sd"> len : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> A column of string, the substring of `str` is of length `len`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(&quot;Spark SQL&quot;, 5, 1,)], [&quot;a&quot;, &quot;b&quot;, &quot;c&quot;]</span>
<span class="sd"> ... ).select(sf.substr(&quot;a&quot;, &quot;b&quot;, &quot;c&quot;)).show()</span>
<span class="sd"> +---------------+</span>
<span class="sd"> |substr(a, b, c)|</span>
<span class="sd"> +---------------+</span>
<span class="sd"> | k|</span>
<span class="sd"> +---------------+</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(&quot;Spark SQL&quot;, 5, 1,)], [&quot;a&quot;, &quot;b&quot;, &quot;c&quot;]</span>
<span class="sd"> ... ).select(sf.substr(&quot;a&quot;, &quot;b&quot;)).show()</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> |substr(a, b, 2147483647)|</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> | k SQL|</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">len</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;substr&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">pos</span><span class="p">,</span> <span class="nb">len</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;substr&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">pos</span><span class="p">)</span></div>
<div class="viewcode-block" id="parse_url"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.parse_url.html#pyspark.sql.functions.parse_url">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">parse_url</span><span class="p">(</span>
<span class="n">url</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">partToExtract</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extracts a part from a URL.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> url : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A column of string.</span>
<span class="sd"> partToExtract : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A column of string, the path.</span>
<span class="sd"> key : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> A column of string, the key.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(&quot;http://spark.apache.org/path?query=1&quot;, &quot;QUERY&quot;, &quot;query&quot;,)],</span>
<span class="sd"> ... [&quot;a&quot;, &quot;b&quot;, &quot;c&quot;]</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(parse_url(df.a, df.b, df.c).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;1&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(parse_url(df.a, df.b).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;query=1&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">key</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;parse_url&quot;</span><span class="p">,</span> <span class="n">url</span><span class="p">,</span> <span class="n">partToExtract</span><span class="p">,</span> <span class="n">key</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;parse_url&quot;</span><span class="p">,</span> <span class="n">url</span><span class="p">,</span> <span class="n">partToExtract</span><span class="p">)</span></div>
<div class="viewcode-block" id="printf"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.printf.html#pyspark.sql.functions.printf">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">printf</span><span class="p">(</span><span class="nb">format</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Formats the arguments in printf-style and returns the result as a string column.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> format : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> string that can contain embedded format tags and used as result column&#39;s value</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column names or :class:`~pyspark.sql.Column`\\s to be used in formatting</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(&quot;aa%d%s&quot;, 123, &quot;cc&quot;,)], [&quot;a&quot;, &quot;b&quot;, &quot;c&quot;]</span>
<span class="sd"> ... ).select(sf.printf(&quot;a&quot;, &quot;b&quot;, &quot;c&quot;)).show()</span>
<span class="sd"> +---------------+</span>
<span class="sd"> |printf(a, b, c)|</span>
<span class="sd"> +---------------+</span>
<span class="sd"> | aa123cc|</span>
<span class="sd"> +---------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;printf&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="nb">format</span><span class="p">),</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span></div>
<div class="viewcode-block" id="url_decode"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.url_decode.html#pyspark.sql.functions.url_decode">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">url_decode</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Decodes a `str` in &#39;application/x-www-form-urlencoded&#39; format</span>
<span class="sd"> using a specific encoding scheme.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A column of string to decode.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;https%3A%2F%2Fspark.apache.org&quot;,)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(url_decode(df.a).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;https://spark.apache.org&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;url_decode&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span></div>
<div class="viewcode-block" id="url_encode"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.url_encode.html#pyspark.sql.functions.url_encode">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">url_encode</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Translates a string into &#39;application/x-www-form-urlencoded&#39; format</span>
<span class="sd"> using a specific encoding scheme.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A column of string to encode.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;https://spark.apache.org&quot;,)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(url_encode(df.a).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;https%3A%2F%2Fspark.apache.org&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;url_encode&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span></div>
<div class="viewcode-block" id="position"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.position.html#pyspark.sql.functions.position">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">position</span><span class="p">(</span>
<span class="n">substr</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">start</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the position of the first occurrence of `substr` in `str` after position `start`.</span>
<span class="sd"> The given `start` and return value are 1-based.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> substr : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A column of string, substring.</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A column of string.</span>
<span class="sd"> start : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> A column of string, start position.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(&quot;bar&quot;, &quot;foobarbar&quot;, 5,)], [&quot;a&quot;, &quot;b&quot;, &quot;c&quot;]</span>
<span class="sd"> ... ).select(sf.position(&quot;a&quot;, &quot;b&quot;, &quot;c&quot;)).show()</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> |position(a, b, c)|</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> | 7|</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(&quot;bar&quot;, &quot;foobarbar&quot;, 5,)], [&quot;a&quot;, &quot;b&quot;, &quot;c&quot;]</span>
<span class="sd"> ... ).select(sf.position(&quot;a&quot;, &quot;b&quot;)).show()</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> |position(a, b, 1)|</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> | 4|</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">start</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;position&quot;</span><span class="p">,</span> <span class="n">substr</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">start</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;position&quot;</span><span class="p">,</span> <span class="n">substr</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span></div>
<div class="viewcode-block" id="endswith"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.endswith.html#pyspark.sql.functions.endswith">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">endswith</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">suffix</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a boolean. The value is True if str ends with suffix.</span>
<span class="sd"> Returns NULL if either input expression is NULL. Otherwise, returns False.</span>
<span class="sd"> Both str or suffix must be of STRING or BINARY type.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A column of string.</span>
<span class="sd"> suffix : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A column of string, the suffix.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;Spark SQL&quot;, &quot;Spark&quot;,)], [&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(endswith(df.a, df.b).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=False)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;414243&quot;, &quot;4243&quot;,)], [&quot;e&quot;, &quot;f&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df = df.select(to_binary(&quot;e&quot;).alias(&quot;e&quot;), to_binary(&quot;f&quot;).alias(&quot;f&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.printSchema()</span>
<span class="sd"> root</span>
<span class="sd"> |-- e: binary (nullable = true)</span>
<span class="sd"> |-- f: binary (nullable = true)</span>
<span class="sd"> &gt;&gt;&gt; df.select(endswith(&quot;e&quot;, &quot;f&quot;), endswith(&quot;f&quot;, &quot;e&quot;)).show()</span>
<span class="sd"> +--------------+--------------+</span>
<span class="sd"> |endswith(e, f)|endswith(f, e)|</span>
<span class="sd"> +--------------+--------------+</span>
<span class="sd"> | true| false|</span>
<span class="sd"> +--------------+--------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;endswith&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">suffix</span><span class="p">)</span></div>
<div class="viewcode-block" id="startswith"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.startswith.html#pyspark.sql.functions.startswith">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">startswith</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">prefix</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a boolean. The value is True if str starts with prefix.</span>
<span class="sd"> Returns NULL if either input expression is NULL. Otherwise, returns False.</span>
<span class="sd"> Both str or prefix must be of STRING or BINARY type.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A column of string.</span>
<span class="sd"> prefix : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A column of string, the prefix.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;Spark SQL&quot;, &quot;Spark&quot;,)], [&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(startswith(df.a, df.b).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=True)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;414243&quot;, &quot;4142&quot;,)], [&quot;e&quot;, &quot;f&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df = df.select(to_binary(&quot;e&quot;).alias(&quot;e&quot;), to_binary(&quot;f&quot;).alias(&quot;f&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.printSchema()</span>
<span class="sd"> root</span>
<span class="sd"> |-- e: binary (nullable = true)</span>
<span class="sd"> |-- f: binary (nullable = true)</span>
<span class="sd"> &gt;&gt;&gt; df.select(startswith(&quot;e&quot;, &quot;f&quot;), startswith(&quot;f&quot;, &quot;e&quot;)).show()</span>
<span class="sd"> +----------------+----------------+</span>
<span class="sd"> |startswith(e, f)|startswith(f, e)|</span>
<span class="sd"> +----------------+----------------+</span>
<span class="sd"> | true| false|</span>
<span class="sd"> +----------------+----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;startswith&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">prefix</span><span class="p">)</span></div>
<div class="viewcode-block" id="char"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.char.html#pyspark.sql.functions.char">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">char</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the ASCII character having the binary equivalent to `col`. If col is larger than 256 the</span>
<span class="sd"> result is equivalent to char(col % 256)</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input column or strings.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).select(sf.char(sf.lit(65))).show()</span>
<span class="sd"> +--------+</span>
<span class="sd"> |char(65)|</span>
<span class="sd"> +--------+</span>
<span class="sd"> | A|</span>
<span class="sd"> +--------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;char&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="btrim"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.btrim.html#pyspark.sql.functions.btrim">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">btrim</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">trim</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Remove the leading and trailing `trim` characters from `str`.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input column or strings.</span>
<span class="sd"> trim : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The trim string characters to trim, the default value is a single space</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;SSparkSQLS&quot;, &quot;SL&quot;, )], [&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(btrim(df.a, df.b).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;parkSQ&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot; SparkSQL &quot;,)], [&#39;a&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(btrim(df.a).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;SparkSQL&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">trim</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;btrim&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">trim</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;btrim&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span></div>
<div class="viewcode-block" id="char_length"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.char_length.html#pyspark.sql.functions.char_length">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">char_length</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the character length of string data or number of bytes of binary data.</span>
<span class="sd"> The length of string data includes the trailing spaces.</span>
<span class="sd"> The length of binary data includes binary zeros.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input column or strings.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).select(sf.char_length(sf.lit(&quot;SparkSQL&quot;))).show()</span>
<span class="sd"> +---------------------+</span>
<span class="sd"> |char_length(SparkSQL)|</span>
<span class="sd"> +---------------------+</span>
<span class="sd"> | 8|</span>
<span class="sd"> +---------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;char_length&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span></div>
<div class="viewcode-block" id="character_length"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.character_length.html#pyspark.sql.functions.character_length">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">character_length</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the character length of string data or number of bytes of binary data.</span>
<span class="sd"> The length of string data includes the trailing spaces.</span>
<span class="sd"> The length of binary data includes binary zeros.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input column or strings.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).select(sf.character_length(sf.lit(&quot;SparkSQL&quot;))).show()</span>
<span class="sd"> +--------------------------+</span>
<span class="sd"> |character_length(SparkSQL)|</span>
<span class="sd"> +--------------------------+</span>
<span class="sd"> | 8|</span>
<span class="sd"> +--------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;character_length&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span></div>
<div class="viewcode-block" id="try_to_binary"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_to_binary.html#pyspark.sql.functions.try_to_binary">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">try_to_binary</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> This is a special version of `to_binary` that performs the same operation, but returns a NULL</span>
<span class="sd"> value instead of raising an error if the conversion cannot be performed.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input column or strings.</span>
<span class="sd"> format : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> format to use to convert binary values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;abc&quot;,)], [&quot;e&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_to_binary(df.e, lit(&quot;utf-8&quot;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=bytearray(b&#39;abc&#39;))]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;414243&quot;,)], [&quot;e&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_to_binary(df.e).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=bytearray(b&#39;ABC&#39;))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">format</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;try_to_binary&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="nb">format</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;try_to_binary&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="try_to_number"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_to_number.html#pyspark.sql.functions.try_to_number">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">try_to_number</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Convert string &#39;col&#39; to a number based on the string format `format`. Returns NULL if the</span>
<span class="sd"> string &#39;col&#39; does not match the expected format. The format follows the same semantics as the</span>
<span class="sd"> to_number function.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input column or strings.</span>
<span class="sd"> format : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> format to use to convert number values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;$78.12&quot;,)], [&quot;e&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_to_number(df.e, lit(&quot;$99.99&quot;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=Decimal(&#39;78.12&#39;))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;try_to_number&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="nb">format</span><span class="p">)</span></div>
<div class="viewcode-block" id="contains"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.contains.html#pyspark.sql.functions.contains">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">contains</span><span class="p">(</span><span class="n">left</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">right</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a boolean. The value is True if right is found inside left.</span>
<span class="sd"> Returns NULL if either input expression is NULL. Otherwise, returns False.</span>
<span class="sd"> Both left or right must be of STRING or BINARY type.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> left : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The input column or strings to check, may be NULL.</span>
<span class="sd"> right : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The input column or strings to find, may be NULL.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;Spark SQL&quot;, &quot;Spark&quot;)], [&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(contains(df.a, df.b).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=True)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;414243&quot;, &quot;4243&quot;,)], [&quot;c&quot;, &quot;d&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df = df.select(to_binary(&quot;c&quot;).alias(&quot;c&quot;), to_binary(&quot;d&quot;).alias(&quot;d&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.printSchema()</span>
<span class="sd"> root</span>
<span class="sd"> |-- c: binary (nullable = true)</span>
<span class="sd"> |-- d: binary (nullable = true)</span>
<span class="sd"> &gt;&gt;&gt; df.select(contains(&quot;c&quot;, &quot;d&quot;), contains(&quot;d&quot;, &quot;c&quot;)).show()</span>
<span class="sd"> +--------------+--------------+</span>
<span class="sd"> |contains(c, d)|contains(d, c)|</span>
<span class="sd"> +--------------+--------------+</span>
<span class="sd"> | true| false|</span>
<span class="sd"> +--------------+--------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;contains&quot;</span><span class="p">,</span> <span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">)</span></div>
<div class="viewcode-block" id="elt"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.elt.html#pyspark.sql.functions.elt">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">elt</span><span class="p">(</span><span class="o">*</span><span class="n">inputs</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the `n`-th input, e.g., returns `input2` when `n` is 2.</span>
<span class="sd"> The function returns NULL if the index exceeds the length of the array</span>
<span class="sd"> and `spark.sql.ansi.enabled` is set to false. If `spark.sql.ansi.enabled` is set to true,</span>
<span class="sd"> it throws ArrayIndexOutOfBoundsException for invalid indices.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> inputs : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input columns or strings.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, &quot;scala&quot;, &quot;java&quot;)], [&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(elt(df.a, df.b, df.c).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;scala&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;elt&quot;</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">inputs</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span></div>
<div class="viewcode-block" id="find_in_set"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.find_in_set.html#pyspark.sql.functions.find_in_set">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">find_in_set</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">str_array</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the index (1-based) of the given string (`str`) in the comma-delimited</span>
<span class="sd"> list (`strArray`). Returns 0, if the string was not found or if the given string (`str`)</span>
<span class="sd"> contains a comma.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The given string to be found.</span>
<span class="sd"> str_array : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The comma-delimited list.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;ab&quot;, &quot;abc,b,ab,c,def&quot;)], [&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(find_in_set(df.a, df.b).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=3)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;find_in_set&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">str_array</span><span class="p">)</span></div>
<div class="viewcode-block" id="like"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.like.html#pyspark.sql.functions.like">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">like</span><span class="p">(</span>
<span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">pattern</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">escapeChar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;Column&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns true if str matches `pattern` with `escape`,</span>
<span class="sd"> null if any arguments are null, false otherwise.</span>
<span class="sd"> The default escape character is the &#39;\&#39;.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A string.</span>
<span class="sd"> pattern : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A string. The pattern is a string which is matched literally, with</span>
<span class="sd"> exception to the following special symbols:</span>
<span class="sd"> _ matches any one character in the input (similar to . in posix regular expressions)</span>
<span class="sd"> % matches zero or more characters in the input (similar to .* in posix regular</span>
<span class="sd"> expressions)</span>
<span class="sd"> Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order</span>
<span class="sd"> to match &quot;\abc&quot;, the pattern should be &quot;\\abc&quot;.</span>
<span class="sd"> When SQL config &#39;spark.sql.parser.escapedStringLiterals&#39; is enabled, it falls back</span>
<span class="sd"> to Spark 1.6 behavior regarding string literal parsing. For example, if the config is</span>
<span class="sd"> enabled, the pattern to match &quot;\abc&quot; should be &quot;\abc&quot;.</span>
<span class="sd"> escape : :class:`~pyspark.sql.Column`</span>
<span class="sd"> An character added since Spark 3.0. The default escape character is the &#39;\&#39;.</span>
<span class="sd"> If an escape character precedes a special symbol or another escape character, the</span>
<span class="sd"> following character is matched literally. It is invalid to escape any other character.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;Spark&quot;, &quot;_park&quot;)], [&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(like(df.a, df.b).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=True)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(&quot;%SystemDrive%/Users/John&quot;, &quot;/%SystemDrive/%//Users%&quot;)],</span>
<span class="sd"> ... [&#39;a&#39;, &#39;b&#39;]</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(like(df.a, df.b, lit(&#39;/&#39;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=True)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">escapeChar</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;like&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">pattern</span><span class="p">,</span> <span class="n">escapeChar</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;like&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">pattern</span><span class="p">)</span></div>
<div class="viewcode-block" id="ilike"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.ilike.html#pyspark.sql.functions.ilike">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">ilike</span><span class="p">(</span>
<span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">pattern</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">escapeChar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;Column&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns true if str matches `pattern` with `escape` case-insensitively,</span>
<span class="sd"> null if any arguments are null, false otherwise.</span>
<span class="sd"> The default escape character is the &#39;\&#39;.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A string.</span>
<span class="sd"> pattern : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A string. The pattern is a string which is matched literally, with</span>
<span class="sd"> exception to the following special symbols:</span>
<span class="sd"> _ matches any one character in the input (similar to . in posix regular expressions)</span>
<span class="sd"> % matches zero or more characters in the input (similar to .* in posix regular</span>
<span class="sd"> expressions)</span>
<span class="sd"> Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order</span>
<span class="sd"> to match &quot;\abc&quot;, the pattern should be &quot;\\abc&quot;.</span>
<span class="sd"> When SQL config &#39;spark.sql.parser.escapedStringLiterals&#39; is enabled, it falls back</span>
<span class="sd"> to Spark 1.6 behavior regarding string literal parsing. For example, if the config is</span>
<span class="sd"> enabled, the pattern to match &quot;\abc&quot; should be &quot;\abc&quot;.</span>
<span class="sd"> escape : :class:`~pyspark.sql.Column`</span>
<span class="sd"> An character added since Spark 3.0. The default escape character is the &#39;\&#39;.</span>
<span class="sd"> If an escape character precedes a special symbol or another escape character, the</span>
<span class="sd"> following character is matched literally. It is invalid to escape any other character.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;Spark&quot;, &quot;_park&quot;)], [&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(ilike(df.a, df.b).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=True)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(&quot;%SystemDrive%/Users/John&quot;, &quot;/%SystemDrive/%//Users%&quot;)],</span>
<span class="sd"> ... [&#39;a&#39;, &#39;b&#39;]</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(ilike(df.a, df.b, lit(&#39;/&#39;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=True)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">escapeChar</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;ilike&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">pattern</span><span class="p">,</span> <span class="n">escapeChar</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;ilike&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">pattern</span><span class="p">)</span></div>
<div class="viewcode-block" id="lcase"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.lcase.html#pyspark.sql.functions.lcase">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">lcase</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns `str` with all characters changed to lowercase.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input column or strings.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).select(sf.lcase(sf.lit(&quot;Spark&quot;))).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |lcase(Spark)|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | spark|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;lcase&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span></div>
<div class="viewcode-block" id="ucase"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.ucase.html#pyspark.sql.functions.ucase">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">ucase</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns `str` with all characters changed to uppercase.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input column or strings.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).select(sf.ucase(sf.lit(&quot;Spark&quot;))).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |ucase(Spark)|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | SPARK|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;ucase&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span></div>
<div class="viewcode-block" id="left"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.left.html#pyspark.sql.functions.left">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">left</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">len</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the leftmost `len`(`len` can be string type) characters from the string `str`,</span>
<span class="sd"> if `len` is less or equal than 0 the result is an empty string.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input column or strings.</span>
<span class="sd"> len : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input column or strings, the leftmost `len`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;Spark SQL&quot;, 3,)], [&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(left(df.a, df.b).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;Spa&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;left&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="nb">len</span><span class="p">)</span></div>
<div class="viewcode-block" id="right"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.right.html#pyspark.sql.functions.right">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">right</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">len</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the rightmost `len`(`len` can be string type) characters from the string `str`,</span>
<span class="sd"> if `len` is less or equal than 0 the result is an empty string.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input column or strings.</span>
<span class="sd"> len : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input column or strings, the rightmost `len`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;Spark SQL&quot;, 3,)], [&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(right(df.a, df.b).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;SQL&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;right&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="nb">len</span><span class="p">)</span></div>
<div class="viewcode-block" id="mask"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.mask.html#pyspark.sql.functions.mask">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">mask</span><span class="p">(</span>
<span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">upperChar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">lowerChar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">digitChar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">otherChar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Masks the given string value. This can be useful for creating copies of tables with sensitive</span>
<span class="sd"> information removed.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col: :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> upperChar: :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> character to replace upper-case characters with. Specify NULL to retain original character.</span>
<span class="sd"> lowerChar: :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> character to replace lower-case characters with. Specify NULL to retain original character.</span>
<span class="sd"> digitChar: :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> character to replace digit characters with. Specify NULL to retain original character.</span>
<span class="sd"> otherChar: :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> character to replace all other characters with. Specify NULL to retain original character.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;AbCD123-@$#&quot;,), (&quot;abcd-EFGH-8765-4321&quot;,)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(mask(df.data).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;XxXXnnn-@$#&#39;), Row(r=&#39;xxxx-XXXX-nnnn-nnnn&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(mask(df.data, lit(&#39;Y&#39;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;YxYYnnn-@$#&#39;), Row(r=&#39;xxxx-YYYY-nnnn-nnnn&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(mask(df.data, lit(&#39;Y&#39;), lit(&#39;y&#39;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;YyYYnnn-@$#&#39;), Row(r=&#39;yyyy-YYYY-nnnn-nnnn&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(mask(df.data, lit(&#39;Y&#39;), lit(&#39;y&#39;), lit(&#39;d&#39;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;YyYYddd-@$#&#39;), Row(r=&#39;yyyy-YYYY-dddd-dddd&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(mask(df.data, lit(&#39;Y&#39;), lit(&#39;y&#39;), lit(&#39;d&#39;), lit(&#39;*&#39;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;YyYYddd****&#39;), Row(r=&#39;yyyy*YYYY*dddd*dddd&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_upperChar</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">&quot;X&quot;</span><span class="p">)</span> <span class="k">if</span> <span class="n">upperChar</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">upperChar</span>
<span class="n">_lowerChar</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">&quot;x&quot;</span><span class="p">)</span> <span class="k">if</span> <span class="n">lowerChar</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">lowerChar</span>
<span class="n">_digitChar</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">&quot;n&quot;</span><span class="p">)</span> <span class="k">if</span> <span class="n">digitChar</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">digitChar</span>
<span class="n">_otherChar</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span> <span class="k">if</span> <span class="n">otherChar</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">otherChar</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span>
<span class="s2">&quot;mask&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">_upperChar</span><span class="p">,</span> <span class="n">_lowerChar</span><span class="p">,</span> <span class="n">_digitChar</span><span class="p">,</span> <span class="n">_otherChar</span>
<span class="p">)</span></div>
<span class="c1"># ---------------------- Collection functions ------------------------------</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">create_map</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">create_map</span><span class="p">(</span><span class="n">__cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">&quot;ColumnOrName_&quot;</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">&quot;ColumnOrName_&quot;</span><span class="p">,</span> <span class="o">...</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="create_map"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.create_map.html#pyspark.sql.functions.create_map">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">create_map</span><span class="p">(</span>
<span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">&quot;ColumnOrName_&quot;</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">&quot;ColumnOrName_&quot;</span><span class="p">,</span> <span class="o">...</span><span class="p">]]]</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Creates a new map column.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column names or :class:`~pyspark.sql.Column`\\s that are</span>
<span class="sd"> grouped as key-value pairs, e.g. (key1, value1, key2, value2, ...).</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;Alice&quot;, 2), (&quot;Bob&quot;, 5)], (&quot;name&quot;, &quot;age&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(create_map(&#39;name&#39;, &#39;age&#39;).alias(&quot;map&quot;)).collect()</span>
<span class="sd"> [Row(map={&#39;Alice&#39;: 2}), Row(map={&#39;Bob&#39;: 5})]</span>
<span class="sd"> &gt;&gt;&gt; df.select(create_map([df.name, df.age]).alias(&quot;map&quot;)).collect()</span>
<span class="sd"> [Row(map={&#39;Alice&#39;: 2}), Row(map={&#39;Bob&#39;: 5})]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">set</span><span class="p">)):</span>
<span class="n">cols</span> <span class="o">=</span> <span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="c1"># type: ignore[assignment]</span>
<span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">&quot;map&quot;</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span> <span class="c1"># type: ignore[arg-type]</span></div>
<div class="viewcode-block" id="map_from_arrays"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.map_from_arrays.html#pyspark.sql.functions.map_from_arrays">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">map_from_arrays</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Creates a new map from two arrays.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing a set of keys. All elements should not be null</span>
<span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing a set of values</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a column of map type.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([2, 5], [&#39;a&#39;, &#39;b&#39;])], [&#39;k&#39;, &#39;v&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df = df.select(map_from_arrays(df.k, df.v).alias(&quot;col&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.show()</span>
<span class="sd"> +----------------+</span>
<span class="sd"> | col|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> |{2 -&gt; a, 5 -&gt; b}|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> &gt;&gt;&gt; df.printSchema()</span>
<span class="sd"> root</span>
<span class="sd"> |-- col: map (nullable = true)</span>
<span class="sd"> | |-- key: long</span>
<span class="sd"> | |-- value: string (valueContainsNull = true)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;map_from_arrays&quot;</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">array</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">array</span><span class="p">(</span><span class="n">__cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">&quot;ColumnOrName_&quot;</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">&quot;ColumnOrName_&quot;</span><span class="p">,</span> <span class="o">...</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="array"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array.html#pyspark.sql.functions.array">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">array</span><span class="p">(</span>
<span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">&quot;ColumnOrName_&quot;</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">&quot;ColumnOrName_&quot;</span><span class="p">,</span> <span class="o">...</span><span class="p">]]]</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Creates a new array column.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column names or :class:`~pyspark.sql.Column`\\s that have</span>
<span class="sd"> the same data type.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a column of array type.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;Alice&quot;, 2), (&quot;Bob&quot;, 5)], (&quot;name&quot;, &quot;age&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(array(&#39;age&#39;, &#39;age&#39;).alias(&quot;arr&quot;)).collect()</span>
<span class="sd"> [Row(arr=[2, 2]), Row(arr=[5, 5])]</span>
<span class="sd"> &gt;&gt;&gt; df.select(array([df.age, df.age]).alias(&quot;arr&quot;)).collect()</span>
<span class="sd"> [Row(arr=[2, 2]), Row(arr=[5, 5])]</span>
<span class="sd"> &gt;&gt;&gt; df.select(array(&#39;age&#39;, &#39;age&#39;).alias(&quot;col&quot;)).printSchema()</span>
<span class="sd"> root</span>
<span class="sd"> |-- col: array (nullable = false)</span>
<span class="sd"> | |-- element: long (containsNull = true)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">set</span><span class="p">)):</span>
<span class="n">cols</span> <span class="o">=</span> <span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="c1"># type: ignore[assignment]</span>
<span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">&quot;array&quot;</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span> <span class="c1"># type: ignore[arg-type]</span></div>
<div class="viewcode-block" id="array_contains"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_contains.html#pyspark.sql.functions.array_contains">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">array_contains</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns null if the array is null, true if the array contains the</span>
<span class="sd"> given value, and false otherwise.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array</span>
<span class="sd"> value :</span>
<span class="sd"> value or column to check for in array</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a column of Boolean type.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([&quot;a&quot;, &quot;b&quot;, &quot;c&quot;],), ([],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_contains(df.data, &quot;a&quot;)).collect()</span>
<span class="sd"> [Row(array_contains(data, a)=True), Row(array_contains(data, a)=False)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_contains(df.data, lit(&quot;a&quot;))).collect()</span>
<span class="sd"> [Row(array_contains(data, a)=True), Row(array_contains(data, a)=False)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">value</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">_jc</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> <span class="k">else</span> <span class="n">value</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;array_contains&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="arrays_overlap"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.arrays_overlap.html#pyspark.sql.functions.arrays_overlap">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">arrays_overlap</span><span class="p">(</span><span class="n">a1</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">a2</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns true if the arrays contain any common non-null element; if not,</span>
<span class="sd"> returns null if both the arrays are non-empty and any of them contains a null element; returns</span>
<span class="sd"> false otherwise.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a column of Boolean type.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([&quot;a&quot;, &quot;b&quot;], [&quot;b&quot;, &quot;c&quot;]), ([&quot;a&quot;], [&quot;b&quot;, &quot;c&quot;])], [&#39;x&#39;, &#39;y&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(arrays_overlap(df.x, df.y).alias(&quot;overlap&quot;)).collect()</span>
<span class="sd"> [Row(overlap=True), Row(overlap=False)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;arrays_overlap&quot;</span><span class="p">,</span> <span class="n">a1</span><span class="p">,</span> <span class="n">a2</span><span class="p">)</span></div>
<div class="viewcode-block" id="slice"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.slice.html#pyspark.sql.functions.slice">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">slice</span><span class="p">(</span>
<span class="n">x</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">start</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">int</span><span class="p">],</span> <span class="n">length</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">int</span><span class="p">]</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns an array containing all the elements in `x` from index `start`</span>
<span class="sd"> (array indices start at 1, or from the end if `start` is negative) with the specified `length`.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> x : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column name or column containing the array to be sliced</span>
<span class="sd"> start : :class:`~pyspark.sql.Column` or str or int</span>
<span class="sd"> column name, column, or int containing the starting index</span>
<span class="sd"> length : :class:`~pyspark.sql.Column` or str or int</span>
<span class="sd"> column name, column, or int containing the length of the slice</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a column of array type. Subset of array.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([1, 2, 3],), ([4, 5],)], [&#39;x&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(slice(df.x, 2, 2).alias(&quot;sliced&quot;)).collect()</span>
<span class="sd"> [Row(sliced=[2, 3]), Row(sliced=[5])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">start</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">start</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">start</span>
<span class="n">length</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">length</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">length</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">length</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;slice&quot;</span><span class="p">,</span> <span class="n">x</span><span class="p">,</span> <span class="n">start</span><span class="p">,</span> <span class="n">length</span><span class="p">)</span></div>
<div class="viewcode-block" id="array_join"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_join.html#pyspark.sql.functions.array_join">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">array_join</span><span class="p">(</span>
<span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">delimiter</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">null_replacement</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Concatenates the elements of `column` using the `delimiter`. Null values are replaced with</span>
<span class="sd"> `null_replacement` if set, otherwise they are ignored.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> delimiter : str</span>
<span class="sd"> delimiter used to concatenate elements</span>
<span class="sd"> null_replacement : str, optional</span>
<span class="sd"> if set then null values will be replaced by this value</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a column of string type. Concatenated values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([&quot;a&quot;, &quot;b&quot;, &quot;c&quot;],), ([&quot;a&quot;, None],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_join(df.data, &quot;,&quot;).alias(&quot;joined&quot;)).collect()</span>
<span class="sd"> [Row(joined=&#39;a,b,c&#39;), Row(joined=&#39;a&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_join(df.data, &quot;,&quot;, &quot;NULL&quot;).alias(&quot;joined&quot;)).collect()</span>
<span class="sd"> [Row(joined=&#39;a,b,c&#39;), Row(joined=&#39;a,NULL&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">get_active_spark_context</span><span class="p">()</span>
<span class="k">if</span> <span class="n">null_replacement</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;array_join&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">delimiter</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;array_join&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">delimiter</span><span class="p">,</span> <span class="n">null_replacement</span><span class="p">)</span></div>
<div class="viewcode-block" id="concat"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.concat.html#pyspark.sql.functions.concat">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">concat</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Concatenates multiple input columns together into a single column.</span>
<span class="sd"> The function works with strings, numeric, binary and compatible array columns.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column or columns to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> concatenated values. Type of the `Column` depends on input columns&#39; type.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> :meth:`pyspark.sql.functions.array_join` : to concatenate string columns with delimiter</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;abcd&#39;,&#39;123&#39;)], [&#39;s&#39;, &#39;d&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df = df.select(concat(df.s, df.d).alias(&#39;s&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df.collect()</span>
<span class="sd"> [Row(s=&#39;abcd123&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> DataFrame[s: string]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([1, 2], [3, 4], [5]), ([1, 2], None, [3])], [&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df = df.select(concat(df.a, df.b, df.c).alias(&quot;arr&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.collect()</span>
<span class="sd"> [Row(arr=[1, 2, 3, 4, 5]), Row(arr=None)]</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> DataFrame[arr: array&lt;bigint&gt;]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">&quot;concat&quot;</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div>
<div class="viewcode-block" id="array_position"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_position.html#pyspark.sql.functions.array_position">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">array_position</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: Locates the position of the first occurrence of the given value</span>
<span class="sd"> in the given array. Returns null if either of the arguments are null.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The position is not zero based, but 1 based index. Returns 0 if the given</span>
<span class="sd"> value could not be found in the array.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> value : Any</span>
<span class="sd"> value to look for.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> position of the value in the given array if found and 0 otherwise.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([&quot;c&quot;, &quot;b&quot;, &quot;a&quot;],), ([],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_position(df.data, &quot;a&quot;)).collect()</span>
<span class="sd"> [Row(array_position(data, a)=3), Row(array_position(data, a)=0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;array_position&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="element_at"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.element_at.html#pyspark.sql.functions.element_at">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">element_at</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">extraction</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: Returns element of array at given index in `extraction` if col is array.</span>
<span class="sd"> Returns value for the given key in `extraction` if col is map. If position is negative</span>
<span class="sd"> then location of the element will start from end, if number is outside the</span>
<span class="sd"> array boundaries then None will be returned.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array or map</span>
<span class="sd"> extraction :</span>
<span class="sd"> index to check for in array or key to check for in map</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> value at given position.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The position is not zero based, but 1 based index.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> :meth:`get`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([&quot;a&quot;, &quot;b&quot;, &quot;c&quot;],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(element_at(df.data, 1)).collect()</span>
<span class="sd"> [Row(element_at(data, 1)=&#39;a&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(element_at(df.data, -1)).collect()</span>
<span class="sd"> [Row(element_at(data, -1)=&#39;c&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([({&quot;a&quot;: 1.0, &quot;b&quot;: 2.0},)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(element_at(df.data, lit(&quot;a&quot;))).collect()</span>
<span class="sd"> [Row(element_at(data, a)=1.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;element_at&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">lit</span><span class="p">(</span><span class="n">extraction</span><span class="p">))</span></div>
<div class="viewcode-block" id="try_element_at"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_element_at.html#pyspark.sql.functions.try_element_at">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">try_element_at</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">extraction</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> (array, index) - Returns element of array at given (1-based) index. If Index is 0, Spark will</span>
<span class="sd"> throw an error. If index &lt; 0, accesses elements from the last to the first. The function</span>
<span class="sd"> always returns NULL if the index exceeds the length of the array.</span>
<span class="sd"> (map, key) - Returns value for given key. The function always returns NULL if the key is not</span>
<span class="sd"> contained in the map.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array or map</span>
<span class="sd"> extraction :</span>
<span class="sd"> index to check for in array or key to check for in map</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([&quot;a&quot;, &quot;b&quot;, &quot;c&quot;],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_element_at(df.data, lit(1)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;a&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_element_at(df.data, lit(-1)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;c&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([({&quot;a&quot;: 1.0, &quot;b&quot;: 2.0},)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_element_at(df.data, lit(&quot;a&quot;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=1.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;try_element_at&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">extraction</span><span class="p">)</span></div>
<div class="viewcode-block" id="get"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.get.html#pyspark.sql.functions.get">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">get</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">index</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">int</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: Returns element of array at given (0-based) index.</span>
<span class="sd"> If the index points outside of the array boundaries, then this function</span>
<span class="sd"> returns NULL.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array</span>
<span class="sd"> index : :class:`~pyspark.sql.Column` or str or int</span>
<span class="sd"> index to check for in array</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> value at given position.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The position is not 1 based, but 0 based index.</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> :meth:`element_at`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([&quot;a&quot;, &quot;b&quot;, &quot;c&quot;], 1)], [&#39;data&#39;, &#39;index&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(get(df.data, 1)).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |get(data, 1)|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | b|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(get(df.data, -1)).show()</span>
<span class="sd"> +-------------+</span>
<span class="sd"> |get(data, -1)|</span>
<span class="sd"> +-------------+</span>
<span class="sd"> | NULL|</span>
<span class="sd"> +-------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(get(df.data, 3)).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |get(data, 3)|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | NULL|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(get(df.data, &quot;index&quot;)).show()</span>
<span class="sd"> +----------------+</span>
<span class="sd"> |get(data, index)|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> | b|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(get(df.data, col(&quot;index&quot;) - 1)).show()</span>
<span class="sd"> +----------------------+</span>
<span class="sd"> |get(data, (index - 1))|</span>
<span class="sd"> +----------------------+</span>
<span class="sd"> | a|</span>
<span class="sd"> +----------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">index</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">index</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">index</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">index</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;get&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">index</span><span class="p">)</span></div>
<div class="viewcode-block" id="array_prepend"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_prepend.html#pyspark.sql.functions.array_prepend">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">array_prepend</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: Returns an array containing element as</span>
<span class="sd"> well as all elements from array. The new element is positioned</span>
<span class="sd"> at the beginning of the array.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array</span>
<span class="sd"> value :</span>
<span class="sd"> a literal value, or a :class:`~pyspark.sql.Column` expression.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> an array excluding given value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([2, 3, 4],), ([],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_prepend(df.data, 1)).collect()</span>
<span class="sd"> [Row(array_prepend(data, 1)=[1, 2, 3, 4]), Row(array_prepend(data, 1)=[1])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;array_prepend&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">lit</span><span class="p">(</span><span class="n">value</span><span class="p">))</span></div>
<div class="viewcode-block" id="array_remove"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_remove.html#pyspark.sql.functions.array_remove">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">array_remove</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">element</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: Remove all elements that equal to element from the given array.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array</span>
<span class="sd"> element :</span>
<span class="sd"> element to be removed from the array</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> an array excluding given value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([1, 2, 3, 1, 1],), ([],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_remove(df.data, 1)).collect()</span>
<span class="sd"> [Row(array_remove(data, 1)=[2, 3]), Row(array_remove(data, 1)=[])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;array_remove&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">element</span><span class="p">)</span></div>
<div class="viewcode-block" id="array_distinct"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_distinct.html#pyspark.sql.functions.array_distinct">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">array_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: removes duplicate values from the array.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> an array of unique values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([1, 2, 3, 2],), ([4, 5, 5, 4],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_distinct(df.data)).collect()</span>
<span class="sd"> [Row(array_distinct(data)=[1, 2, 3]), Row(array_distinct(data)=[4, 5])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;array_distinct&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="array_insert"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_insert.html#pyspark.sql.functions.array_insert">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">array_insert</span><span class="p">(</span><span class="n">arr</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">pos</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">int</span><span class="p">],</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: adds an item into a given array at a specified array index.</span>
<span class="sd"> Array indices start at 1, or start from the end if index is negative.</span>
<span class="sd"> Index above array size appends the array, or prepends the array if index is negative,</span>
<span class="sd"> with &#39;null&#39; elements.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> arr : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing an array</span>
<span class="sd"> pos : :class:`~pyspark.sql.Column` or str or int</span>
<span class="sd"> name of Numeric type column indicating position of insertion</span>
<span class="sd"> (starting at index 1, negative position is a start from the back of the array)</span>
<span class="sd"> value :</span>
<span class="sd"> a literal value, or a :class:`~pyspark.sql.Column` expression.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> an array of values, including the new specified value</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [([&#39;a&#39;, &#39;b&#39;, &#39;c&#39;], 2, &#39;d&#39;), ([&#39;c&#39;, &#39;b&#39;, &#39;a&#39;], -2, &#39;d&#39;)],</span>
<span class="sd"> ... [&#39;data&#39;, &#39;pos&#39;, &#39;val&#39;]</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_insert(df.data, df.pos.cast(&#39;integer&#39;), df.val).alias(&#39;data&#39;)).collect()</span>
<span class="sd"> [Row(data=[&#39;a&#39;, &#39;d&#39;, &#39;b&#39;, &#39;c&#39;]), Row(data=[&#39;c&#39;, &#39;b&#39;, &#39;d&#39;, &#39;a&#39;])]</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_insert(df.data, 5, &#39;hello&#39;).alias(&#39;data&#39;)).collect()</span>
<span class="sd"> [Row(data=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, None, &#39;hello&#39;]), Row(data=[&#39;c&#39;, &#39;b&#39;, &#39;a&#39;, None, &#39;hello&#39;])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">pos</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">pos</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pos</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">pos</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;array_insert&quot;</span><span class="p">,</span> <span class="n">arr</span><span class="p">,</span> <span class="n">pos</span><span class="p">,</span> <span class="n">lit</span><span class="p">(</span><span class="n">value</span><span class="p">))</span></div>
<div class="viewcode-block" id="array_intersect"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_intersect.html#pyspark.sql.functions.array_intersect">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">array_intersect</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns an array of the elements in the intersection of col1 and col2,</span>
<span class="sd"> without duplicates.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array</span>
<span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> an array of values in the intersection of two arrays.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([Row(c1=[&quot;b&quot;, &quot;a&quot;, &quot;c&quot;], c2=[&quot;c&quot;, &quot;d&quot;, &quot;a&quot;, &quot;f&quot;])])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_intersect(df.c1, df.c2)).collect()</span>
<span class="sd"> [Row(array_intersect(c1, c2)=[&#39;a&#39;, &#39;c&#39;])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;array_intersect&quot;</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div>
<div class="viewcode-block" id="array_union"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_union.html#pyspark.sql.functions.array_union">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">array_union</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns an array of the elements in the union of col1 and col2,</span>
<span class="sd"> without duplicates.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array</span>
<span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> an array of values in union of two arrays.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([Row(c1=[&quot;b&quot;, &quot;a&quot;, &quot;c&quot;], c2=[&quot;c&quot;, &quot;d&quot;, &quot;a&quot;, &quot;f&quot;])])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_union(df.c1, df.c2)).collect()</span>
<span class="sd"> [Row(array_union(c1, c2)=[&#39;b&#39;, &#39;a&#39;, &#39;c&#39;, &#39;d&#39;, &#39;f&#39;])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;array_union&quot;</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div>
<div class="viewcode-block" id="array_except"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_except.html#pyspark.sql.functions.array_except">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">array_except</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns an array of the elements in col1 but not in col2,</span>
<span class="sd"> without duplicates.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array</span>
<span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> an array of values from first array that are not in the second.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([Row(c1=[&quot;b&quot;, &quot;a&quot;, &quot;c&quot;], c2=[&quot;c&quot;, &quot;d&quot;, &quot;a&quot;, &quot;f&quot;])])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_except(df.c1, df.c2)).collect()</span>
<span class="sd"> [Row(array_except(c1, c2)=[&#39;b&#39;])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;array_except&quot;</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div>
<div class="viewcode-block" id="array_compact"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_compact.html#pyspark.sql.functions.array_compact">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">array_compact</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: removes null values from the array.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> an array by excluding the null values.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([1, None, 2, 3],), ([4, 5, None, 4],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_compact(df.data)).collect()</span>
<span class="sd"> [Row(array_compact(data)=[1, 2, 3]), Row(array_compact(data)=[4, 5, 4])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;array_compact&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="array_append"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_append.html#pyspark.sql.functions.array_append">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">array_append</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns an array of the elements in col1 along</span>
<span class="sd"> with the added element in col2 at the last of the array.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array</span>
<span class="sd"> value :</span>
<span class="sd"> a literal value, or a :class:`~pyspark.sql.Column` expression.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> an array of values from first array along with the element.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([Row(c1=[&quot;b&quot;, &quot;a&quot;, &quot;c&quot;], c2=&quot;c&quot;)])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_append(df.c1, df.c2)).collect()</span>
<span class="sd"> [Row(array_append(c1, c2)=[&#39;b&#39;, &#39;a&#39;, &#39;c&#39;, &#39;c&#39;])]</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_append(df.c1, &#39;x&#39;)).collect()</span>
<span class="sd"> [Row(array_append(c1, x)=[&#39;b&#39;, &#39;a&#39;, &#39;c&#39;, &#39;x&#39;])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;array_append&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">lit</span><span class="p">(</span><span class="n">value</span><span class="p">))</span></div>
<div class="viewcode-block" id="explode"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.explode.html#pyspark.sql.functions.explode">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">explode</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a new row for each element in the given array or map.</span>
<span class="sd"> Uses the default column name `col` for elements in the array and</span>
<span class="sd"> `key` and `value` for elements in the map unless specified otherwise.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> one row per array item or map key value.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> :meth:`pyspark.functions.posexplode`</span>
<span class="sd"> :meth:`pyspark.functions.explode_outer`</span>
<span class="sd"> :meth:`pyspark.functions.posexplode_outer`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={&quot;a&quot;: &quot;b&quot;})])</span>
<span class="sd"> &gt;&gt;&gt; df.select(explode(df.intlist).alias(&quot;anInt&quot;)).collect()</span>
<span class="sd"> [Row(anInt=1), Row(anInt=2), Row(anInt=3)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(explode(df.mapfield).alias(&quot;key&quot;, &quot;value&quot;)).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |key|value|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | a| b|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;explode&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="posexplode"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.posexplode.html#pyspark.sql.functions.posexplode">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">posexplode</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a new row for each element with position in the given array or map.</span>
<span class="sd"> Uses the default column name `pos` for position, and `col` for elements in the</span>
<span class="sd"> array and `key` and `value` for elements in the map unless specified otherwise.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> one row per array item or map key value including positions as a separate column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={&quot;a&quot;: &quot;b&quot;})])</span>
<span class="sd"> &gt;&gt;&gt; df.select(posexplode(df.intlist)).collect()</span>
<span class="sd"> [Row(pos=0, col=1), Row(pos=1, col=2), Row(pos=2, col=3)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(posexplode(df.mapfield)).show()</span>
<span class="sd"> +---+---+-----+</span>
<span class="sd"> |pos|key|value|</span>
<span class="sd"> +---+---+-----+</span>
<span class="sd"> | 0| a| b|</span>
<span class="sd"> +---+---+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;posexplode&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="inline"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.inline.html#pyspark.sql.functions.inline">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">inline</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Explodes an array of structs into a table.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> input column of values to explode.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> generator expression with the inline exploded result.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> :meth:`explode`</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([Row(structlist=[Row(a=1, b=2), Row(a=3, b=4)])])</span>
<span class="sd"> &gt;&gt;&gt; df.select(inline(df.structlist)).show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | a| b|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | 1| 2|</span>
<span class="sd"> | 3| 4|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;inline&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="explode_outer"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.explode_outer.html#pyspark.sql.functions.explode_outer">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">explode_outer</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a new row for each element in the given array or map.</span>
<span class="sd"> Unlike explode, if the array/map is null or empty then null is produced.</span>
<span class="sd"> Uses the default column name `col` for elements in the array and</span>
<span class="sd"> `key` and `value` for elements in the map unless specified otherwise.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> one row per array item or map key value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(1, [&quot;foo&quot;, &quot;bar&quot;], {&quot;x&quot;: 1.0}), (2, [], {}), (3, None, None)],</span>
<span class="sd"> ... (&quot;id&quot;, &quot;an_array&quot;, &quot;a_map&quot;)</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;id&quot;, &quot;an_array&quot;, explode_outer(&quot;a_map&quot;)).show()</span>
<span class="sd"> +---+----------+----+-----+</span>
<span class="sd"> | id| an_array| key|value|</span>
<span class="sd"> +---+----------+----+-----+</span>
<span class="sd"> | 1|[foo, bar]| x| 1.0|</span>
<span class="sd"> | 2| []|NULL| NULL|</span>
<span class="sd"> | 3| NULL|NULL| NULL|</span>
<span class="sd"> +---+----------+----+-----+</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;id&quot;, &quot;a_map&quot;, explode_outer(&quot;an_array&quot;)).show()</span>
<span class="sd"> +---+----------+----+</span>
<span class="sd"> | id| a_map| col|</span>
<span class="sd"> +---+----------+----+</span>
<span class="sd"> | 1|{x -&gt; 1.0}| foo|</span>
<span class="sd"> | 1|{x -&gt; 1.0}| bar|</span>
<span class="sd"> | 2| {}|NULL|</span>
<span class="sd"> | 3| NULL|NULL|</span>
<span class="sd"> +---+----------+----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;explode_outer&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="posexplode_outer"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.posexplode_outer.html#pyspark.sql.functions.posexplode_outer">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">posexplode_outer</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a new row for each element with position in the given array or map.</span>
<span class="sd"> Unlike posexplode, if the array/map is null or empty then the row (null, null) is produced.</span>
<span class="sd"> Uses the default column name `pos` for position, and `col` for elements in the</span>
<span class="sd"> array and `key` and `value` for elements in the map unless specified otherwise.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> one row per array item or map key value including positions as a separate column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(1, [&quot;foo&quot;, &quot;bar&quot;], {&quot;x&quot;: 1.0}), (2, [], {}), (3, None, None)],</span>
<span class="sd"> ... (&quot;id&quot;, &quot;an_array&quot;, &quot;a_map&quot;)</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;id&quot;, &quot;an_array&quot;, posexplode_outer(&quot;a_map&quot;)).show()</span>
<span class="sd"> +---+----------+----+----+-----+</span>
<span class="sd"> | id| an_array| pos| key|value|</span>
<span class="sd"> +---+----------+----+----+-----+</span>
<span class="sd"> | 1|[foo, bar]| 0| x| 1.0|</span>
<span class="sd"> | 2| []|NULL|NULL| NULL|</span>
<span class="sd"> | 3| NULL|NULL|NULL| NULL|</span>
<span class="sd"> +---+----------+----+----+-----+</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;id&quot;, &quot;a_map&quot;, posexplode_outer(&quot;an_array&quot;)).show()</span>
<span class="sd"> +---+----------+----+----+</span>
<span class="sd"> | id| a_map| pos| col|</span>
<span class="sd"> +---+----------+----+----+</span>
<span class="sd"> | 1|{x -&gt; 1.0}| 0| foo|</span>
<span class="sd"> | 1|{x -&gt; 1.0}| 1| bar|</span>
<span class="sd"> | 2| {}|NULL|NULL|</span>
<span class="sd"> | 3| NULL|NULL|NULL|</span>
<span class="sd"> +---+----------+----+----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;posexplode_outer&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="inline_outer"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.inline_outer.html#pyspark.sql.functions.inline_outer">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">inline_outer</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Explodes an array of structs into a table.</span>
<span class="sd"> Unlike inline, if the array is null or empty then null is produced for each nested column.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> input column of values to explode.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> generator expression with the inline exploded result.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> :meth:`explode_outer`</span>
<span class="sd"> :meth:`inline`</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... Row(id=1, structlist=[Row(a=1, b=2), Row(a=3, b=4)]),</span>
<span class="sd"> ... Row(id=2, structlist=[])</span>
<span class="sd"> ... ])</span>
<span class="sd"> &gt;&gt;&gt; df.select(&#39;id&#39;, inline_outer(df.structlist)).show()</span>
<span class="sd"> +---+----+----+</span>
<span class="sd"> | id| a| b|</span>
<span class="sd"> +---+----+----+</span>
<span class="sd"> | 1| 1| 2|</span>
<span class="sd"> | 1| 3| 4|</span>
<span class="sd"> | 2|NULL|NULL|</span>
<span class="sd"> +---+----+----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;inline_outer&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="get_json_object"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.get_json_object.html#pyspark.sql.functions.get_json_object">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">get_json_object</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extracts json object from a json string based on json `path` specified, and returns json string</span>
<span class="sd"> of the extracted json object. It will return null if the input json string is invalid.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> string column in json format</span>
<span class="sd"> path : str</span>
<span class="sd"> path to the json object to extract</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> string representation of given JSON object value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; data = [(&quot;1&quot;, &#39;&#39;&#39;{&quot;f1&quot;: &quot;value1&quot;, &quot;f2&quot;: &quot;value2&quot;}&#39;&#39;&#39;), (&quot;2&quot;, &#39;&#39;&#39;{&quot;f1&quot;: &quot;value12&quot;}&#39;&#39;&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;jstring&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(df.key, get_json_object(df.jstring, &#39;$.f1&#39;).alias(&quot;c0&quot;), \\</span>
<span class="sd"> ... get_json_object(df.jstring, &#39;$.f2&#39;).alias(&quot;c1&quot;) ).collect()</span>
<span class="sd"> [Row(key=&#39;1&#39;, c0=&#39;value1&#39;, c1=&#39;value2&#39;), Row(key=&#39;2&#39;, c0=&#39;value12&#39;, c1=None)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;get_json_object&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">path</span><span class="p">)</span></div>
<div class="viewcode-block" id="json_tuple"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.json_tuple.html#pyspark.sql.functions.json_tuple">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">json_tuple</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="o">*</span><span class="n">fields</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Creates a new row for a json column according to the given field names.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> string column in json format</span>
<span class="sd"> fields : str</span>
<span class="sd"> a field or fields to extract</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a new row for each given field value from json object</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; data = [(&quot;1&quot;, &#39;&#39;&#39;{&quot;f1&quot;: &quot;value1&quot;, &quot;f2&quot;: &quot;value2&quot;}&#39;&#39;&#39;), (&quot;2&quot;, &#39;&#39;&#39;{&quot;f1&quot;: &quot;value12&quot;}&#39;&#39;&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;jstring&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(df.key, json_tuple(df.jstring, &#39;f1&#39;, &#39;f2&#39;)).collect()</span>
<span class="sd"> [Row(key=&#39;1&#39;, c0=&#39;value1&#39;, c1=&#39;value2&#39;), Row(key=&#39;2&#39;, c0=&#39;value12&#39;, c1=None)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;json_tuple&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">fields</span><span class="p">))</span></div>
<div class="viewcode-block" id="from_json"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.from_json.html#pyspark.sql.functions.from_json">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">from_json</span><span class="p">(</span>
<span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">ArrayType</span><span class="p">,</span> <span class="n">StructType</span><span class="p">,</span> <span class="n">Column</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span>
<span class="n">options</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Parses a column containing a JSON string into a :class:`MapType` with :class:`StringType`</span>
<span class="sd"> as keys type, :class:`StructType` or :class:`ArrayType` with</span>
<span class="sd"> the specified schema. Returns `null`, in the case of an unparseable string.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a column or column name in JSON format</span>
<span class="sd"> schema : :class:`DataType` or str</span>
<span class="sd"> a StructType, ArrayType of StructType or Python string literal with a DDL-formatted string</span>
<span class="sd"> to use when parsing the json column</span>
<span class="sd"> options : dict, optional</span>
<span class="sd"> options to control parsing. accepts the same options as the json datasource.</span>
<span class="sd"> See `Data Source Option &lt;https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option&gt;`_</span>
<span class="sd"> for the version you use.</span>
<span class="sd"> .. # noqa</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a new column of complex type from given JSON object.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.types import *</span>
<span class="sd"> &gt;&gt;&gt; data = [(1, &#39;&#39;&#39;{&quot;a&quot;: 1}&#39;&#39;&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; schema = StructType([StructField(&quot;a&quot;, IntegerType())])</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;value&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_json(df.value, schema).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=Row(a=1))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_json(df.value, &quot;a INT&quot;).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=Row(a=1))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_json(df.value, &quot;MAP&lt;STRING,INT&gt;&quot;).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json={&#39;a&#39;: 1})]</span>
<span class="sd"> &gt;&gt;&gt; data = [(1, &#39;&#39;&#39;[{&quot;a&quot;: 1}]&#39;&#39;&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; schema = ArrayType(StructType([StructField(&quot;a&quot;, IntegerType())]))</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;value&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_json(df.value, schema).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=[Row(a=1)])]</span>
<span class="sd"> &gt;&gt;&gt; schema = schema_of_json(lit(&#39;&#39;&#39;{&quot;a&quot;: 0}&#39;&#39;&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_json(df.value, schema).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=Row(a=None))]</span>
<span class="sd"> &gt;&gt;&gt; data = [(1, &#39;&#39;&#39;[1, 2, 3]&#39;&#39;&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; schema = ArrayType(IntegerType())</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;value&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_json(df.value, schema).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=[1, 2, 3])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">DataType</span><span class="p">):</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">schema</span><span class="o">.</span><span class="n">json</span><span class="p">()</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;from_json&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">schema</span><span class="p">,</span> <span class="n">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="p">))</span></div>
<div class="viewcode-block" id="to_json"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_json.html#pyspark.sql.functions.to_json">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">to_json</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">options</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts a column containing a :class:`StructType`, :class:`ArrayType` or a :class:`MapType`</span>
<span class="sd"> into a JSON string. Throws an exception, in the case of an unsupported type.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing a struct, an array or a map.</span>
<span class="sd"> options : dict, optional</span>
<span class="sd"> options to control converting. accepts the same options as the JSON datasource.</span>
<span class="sd"> See `Data Source Option &lt;https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option&gt;`_</span>
<span class="sd"> for the version you use.</span>
<span class="sd"> Additionally the function supports the `pretty` option which enables</span>
<span class="sd"> pretty JSON generation.</span>
<span class="sd"> .. # noqa</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> JSON object as string column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.types import *</span>
<span class="sd"> &gt;&gt;&gt; data = [(1, Row(age=2, name=&#39;Alice&#39;))]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;value&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_json(df.value).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=&#39;{&quot;age&quot;:2,&quot;name&quot;:&quot;Alice&quot;}&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; data = [(1, [Row(age=2, name=&#39;Alice&#39;), Row(age=3, name=&#39;Bob&#39;)])]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;value&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_json(df.value).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=&#39;[{&quot;age&quot;:2,&quot;name&quot;:&quot;Alice&quot;},{&quot;age&quot;:3,&quot;name&quot;:&quot;Bob&quot;}]&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; data = [(1, {&quot;name&quot;: &quot;Alice&quot;})]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;value&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_json(df.value).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=&#39;{&quot;name&quot;:&quot;Alice&quot;}&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; data = [(1, [{&quot;name&quot;: &quot;Alice&quot;}, {&quot;name&quot;: &quot;Bob&quot;}])]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;value&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_json(df.value).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=&#39;[{&quot;name&quot;:&quot;Alice&quot;},{&quot;name&quot;:&quot;Bob&quot;}]&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; data = [(1, [&quot;Alice&quot;, &quot;Bob&quot;])]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;value&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_json(df.value).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=&#39;[&quot;Alice&quot;,&quot;Bob&quot;]&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;to_json&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="p">))</span></div>
<div class="viewcode-block" id="schema_of_json"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.schema_of_json.html#pyspark.sql.functions.schema_of_json">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">schema_of_json</span><span class="p">(</span><span class="n">json</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">options</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Parses a JSON string and infers its schema in DDL format.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> json : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a JSON string or a foldable string column containing a JSON string.</span>
<span class="sd"> options : dict, optional</span>
<span class="sd"> options to control parsing. accepts the same options as the JSON datasource.</span>
<span class="sd"> See `Data Source Option &lt;https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option&gt;`_</span>
<span class="sd"> for the version you use.</span>
<span class="sd"> .. # noqa</span>
<span class="sd"> .. versionchanged:: 3.0.0</span>
<span class="sd"> It accepts `options` parameter to control schema inferring.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a string representation of a :class:`StructType` parsed from given JSON.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(schema_of_json(lit(&#39;{&quot;a&quot;: 0}&#39;)).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=&#39;STRUCT&lt;a: BIGINT&gt;&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; schema = schema_of_json(&#39;{a: 1}&#39;, {&#39;allowUnquotedFieldNames&#39;:&#39;true&#39;})</span>
<span class="sd"> &gt;&gt;&gt; df.select(schema.alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=&#39;STRUCT&lt;a: BIGINT&gt;&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">json</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">json</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">json</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">json</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span>
<span class="n">error_class</span><span class="o">=</span><span class="s2">&quot;NOT_COLUMN_OR_STR&quot;</span><span class="p">,</span>
<span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;arg_name&quot;</span><span class="p">:</span> <span class="s2">&quot;json&quot;</span><span class="p">,</span> <span class="s2">&quot;arg_type&quot;</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">json</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;schema_of_json&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="p">))</span></div>
<div class="viewcode-block" id="json_array_length"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.json_array_length.html#pyspark.sql.functions.json_array_length">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">json_array_length</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the number of elements in the outermost JSON array. `NULL` is returned in case of</span>
<span class="sd"> any other valid JSON string, `NULL` or an invalid JSON.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col: :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> length of json array.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(None,), (&#39;[1, 2, 3]&#39;,), (&#39;[]&#39;,)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(json_array_length(df.data).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=None), Row(r=3), Row(r=0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;json_array_length&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="json_object_keys"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.json_object_keys.html#pyspark.sql.functions.json_object_keys">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">json_object_keys</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns all the keys of the outermost JSON object as an array. If a valid JSON object is</span>
<span class="sd"> given, all the keys of the outermost object will be returned as an array. If it is any</span>
<span class="sd"> other valid JSON string, an invalid JSON string or an empty string, the function returns null.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col: :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> all the keys of the outermost JSON object.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(None,), (&#39;{}&#39;,), (&#39;{&quot;key1&quot;:1, &quot;key2&quot;:2}&#39;,)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(json_object_keys(df.data).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=None), Row(r=[]), Row(r=[&#39;key1&#39;, &#39;key2&#39;])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;json_object_keys&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="schema_of_csv"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.schema_of_csv.html#pyspark.sql.functions.schema_of_csv">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">schema_of_csv</span><span class="p">(</span><span class="n">csv</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">options</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Parses a CSV string and infers its schema in DDL format.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> csv : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a CSV string or a foldable string column containing a CSV string.</span>
<span class="sd"> options : dict, optional</span>
<span class="sd"> options to control parsing. accepts the same options as the CSV datasource.</span>
<span class="sd"> See `Data Source Option &lt;https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option&gt;`_</span>
<span class="sd"> for the version you use.</span>
<span class="sd"> .. # noqa</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a string representation of a :class:`StructType` parsed from given CSV.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(schema_of_csv(lit(&#39;1|a&#39;), {&#39;sep&#39;:&#39;|&#39;}).alias(&quot;csv&quot;)).collect()</span>
<span class="sd"> [Row(csv=&#39;STRUCT&lt;_c0: INT, _c1: STRING&gt;&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(schema_of_csv(&#39;1|a&#39;, {&#39;sep&#39;:&#39;|&#39;}).alias(&quot;csv&quot;)).collect()</span>
<span class="sd"> [Row(csv=&#39;STRUCT&lt;_c0: INT, _c1: STRING&gt;&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">csv</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">csv</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">csv</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">csv</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span>
<span class="n">error_class</span><span class="o">=</span><span class="s2">&quot;NOT_COLUMN_OR_STR&quot;</span><span class="p">,</span>
<span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;arg_name&quot;</span><span class="p">:</span> <span class="s2">&quot;csv&quot;</span><span class="p">,</span> <span class="s2">&quot;arg_type&quot;</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">csv</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;schema_of_csv&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="p">))</span></div>
<div class="viewcode-block" id="to_csv"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_csv.html#pyspark.sql.functions.to_csv">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">to_csv</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">options</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts a column containing a :class:`StructType` into a CSV string.</span>
<span class="sd"> Throws an exception, in the case of an unsupported type.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing a struct.</span>
<span class="sd"> options: dict, optional</span>
<span class="sd"> options to control converting. accepts the same options as the CSV datasource.</span>
<span class="sd"> See `Data Source Option &lt;https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option&gt;`_</span>
<span class="sd"> for the version you use.</span>
<span class="sd"> .. # noqa</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a CSV string converted from given :class:`StructType`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; data = [(1, Row(age=2, name=&#39;Alice&#39;))]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;value&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_csv(df.value).alias(&quot;csv&quot;)).collect()</span>
<span class="sd"> [Row(csv=&#39;2,Alice&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;to_csv&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="p">))</span></div>
<div class="viewcode-block" id="size"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.size.html#pyspark.sql.functions.size">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">size</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns the length of the array or map stored in the column.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> length of the array/map.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([1, 2, 3],),([1],),([],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(size(df.data)).collect()</span>
<span class="sd"> [Row(size(data)=3), Row(size(data)=1), Row(size(data)=0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;size&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="array_min"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_min.html#pyspark.sql.functions.array_min">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">array_min</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns the minimum value of the array.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> minimum value of array.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_min(df.data).alias(&#39;min&#39;)).collect()</span>
<span class="sd"> [Row(min=1), Row(min=-1)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;array_min&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="array_max"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_max.html#pyspark.sql.functions.array_max">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">array_max</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns the maximum value of the array.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> maximum value of an array.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_max(df.data).alias(&#39;max&#39;)).collect()</span>
<span class="sd"> [Row(max=3), Row(max=10)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;array_max&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="array_size"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_size.html#pyspark.sql.functions.array_size">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">array_size</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the total number of elements in the array. The function returns null for null input.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> total number of elements in the array.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([2, 1, 3],), (None,)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_size(df.data).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=3), Row(r=None)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;array_size&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="cardinality"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.cardinality.html#pyspark.sql.functions.cardinality">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">cardinality</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns the length of the array or map stored in the column.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> length of the array/map.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [([1, 2, 3],),([1],),([],)], [&#39;data&#39;]</span>
<span class="sd"> ... ).select(sf.cardinality(&quot;data&quot;)).show()</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> |cardinality(data)|</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> | 3|</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 0|</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;cardinality&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="sort_array"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sort_array.html#pyspark.sql.functions.sort_array">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">sort_array</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">asc</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: sorts the input array in ascending or descending order according</span>
<span class="sd"> to the natural ordering of the array elements. Null elements will be placed at the beginning</span>
<span class="sd"> of the returned array in ascending order or at the end of the returned array in descending</span>
<span class="sd"> order.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> asc : bool, optional</span>
<span class="sd"> whether to sort in ascending or descending order. If `asc` is True (default)</span>
<span class="sd"> then ascending and if False then descending.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> sorted array.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([2, 1, None, 3],),([1],),([],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(sort_array(df.data).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=[None, 1, 2, 3]), Row(r=[1]), Row(r=[])]</span>
<span class="sd"> &gt;&gt;&gt; df.select(sort_array(df.data, asc=False).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=[3, 2, 1, None]), Row(r=[1]), Row(r=[])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;sort_array&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">asc</span><span class="p">)</span></div>
<div class="viewcode-block" id="array_sort"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_sort.html#pyspark.sql.functions.array_sort">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">array_sort</span><span class="p">(</span>
<span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">comparator</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: sorts the input array in ascending order. The elements of the input array</span>
<span class="sd"> must be orderable. Null elements will be placed at the end of the returned array.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Can take a `comparator` function.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> comparator : callable, optional</span>
<span class="sd"> A binary ``(Column, Column) -&gt; Column: ...``.</span>
<span class="sd"> The comparator will take two</span>
<span class="sd"> arguments representing two elements of the array. It returns a negative integer, 0, or a</span>
<span class="sd"> positive integer as the first element is less than, equal to, or greater than the second</span>
<span class="sd"> element. If the comparator function returns null, the function will fail and raise an error.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> sorted array.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([2, 1, None, 3],),([1],),([],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_sort(df.data).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=[1, 2, 3, None]), Row(r=[1]), Row(r=[])]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([&quot;foo&quot;, &quot;foobar&quot;, None, &quot;bar&quot;],),([&quot;foo&quot;],),([],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_sort(</span>
<span class="sd"> ... &quot;data&quot;,</span>
<span class="sd"> ... lambda x, y: when(x.isNull() | y.isNull(), lit(0)).otherwise(length(y) - length(x))</span>
<span class="sd"> ... ).alias(&quot;r&quot;)).collect()</span>
<span class="sd"> [Row(r=[&#39;foobar&#39;, &#39;foo&#39;, None, &#39;bar&#39;]), Row(r=[&#39;foo&#39;]), Row(r=[])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">comparator</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;array_sort&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;ArraySort&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">comparator</span><span class="p">])</span></div>
<div class="viewcode-block" id="shuffle"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.shuffle.html#pyspark.sql.functions.shuffle">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">shuffle</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: Generates a random permutation of the given array.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The function is non-deterministic.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> an array of elements in random order.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([1, 20, 3, 5],), ([1, 20, None, 3],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(shuffle(df.data).alias(&#39;s&#39;)).collect() # doctest: +SKIP</span>
<span class="sd"> [Row(s=[3, 1, 5, 20]), Row(s=[20, None, 3, 1])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;shuffle&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="reverse"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.reverse.html#pyspark.sql.functions.reverse">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">reverse</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns a reversed string or an array with reverse order of elements.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> array of elements in reverse order.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;Spark SQL&#39;,)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(reverse(df.data).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=&#39;LQS krapS&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([2, 1, 3],) ,([1],) ,([],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(reverse(df.data).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=[3, 1, 2]), Row(r=[1]), Row(r=[])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;reverse&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="flatten"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.flatten.html#pyspark.sql.functions.flatten">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">flatten</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: creates a single array from an array of arrays.</span>
<span class="sd"> If a structure of nested arrays is deeper than two levels,</span>
<span class="sd"> only one level of nesting is removed.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> flattened array.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([[1, 2, 3], [4, 5], [6]],), ([None, [4, 5]],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.show(truncate=False)</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> |data |</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> |[[1, 2, 3], [4, 5], [6]]|</span>
<span class="sd"> |[NULL, [4, 5]] |</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(flatten(df.data).alias(&#39;r&#39;)).show()</span>
<span class="sd"> +------------------+</span>
<span class="sd"> | r|</span>
<span class="sd"> +------------------+</span>
<span class="sd"> |[1, 2, 3, 4, 5, 6]|</span>
<span class="sd"> | NULL|</span>
<span class="sd"> +------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;flatten&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="map_contains_key"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.map_contains_key.html#pyspark.sql.functions.map_contains_key">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">map_contains_key</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns true if the map contains the key.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> value :</span>
<span class="sd"> a literal value</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> True if key is in the map and False otherwise.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import map_contains_key</span>
<span class="sd"> &gt;&gt;&gt; df = spark.sql(&quot;SELECT map(1, &#39;a&#39;, 2, &#39;b&#39;) as data&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(map_contains_key(&quot;data&quot;, 1)).show()</span>
<span class="sd"> +---------------------------------+</span>
<span class="sd"> |array_contains(map_keys(data), 1)|</span>
<span class="sd"> +---------------------------------+</span>
<span class="sd"> | true|</span>
<span class="sd"> +---------------------------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(map_contains_key(&quot;data&quot;, -1)).show()</span>
<span class="sd"> +----------------------------------+</span>
<span class="sd"> |array_contains(map_keys(data), -1)|</span>
<span class="sd"> +----------------------------------+</span>
<span class="sd"> | false|</span>
<span class="sd"> +----------------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;map_contains_key&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">value</span><span class="p">)</span></div>
<div class="viewcode-block" id="map_keys"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.map_keys.html#pyspark.sql.functions.map_keys">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">map_keys</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: Returns an unordered array containing the keys of the map.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> keys of the map as an array.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import map_keys</span>
<span class="sd"> &gt;&gt;&gt; df = spark.sql(&quot;SELECT map(1, &#39;a&#39;, 2, &#39;b&#39;) as data&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(map_keys(&quot;data&quot;).alias(&quot;keys&quot;)).show()</span>
<span class="sd"> +------+</span>
<span class="sd"> | keys|</span>
<span class="sd"> +------+</span>
<span class="sd"> |[1, 2]|</span>
<span class="sd"> +------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;map_keys&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="map_values"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.map_values.html#pyspark.sql.functions.map_values">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">map_values</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: Returns an unordered array containing the values of the map.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> values of the map as an array.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import map_values</span>
<span class="sd"> &gt;&gt;&gt; df = spark.sql(&quot;SELECT map(1, &#39;a&#39;, 2, &#39;b&#39;) as data&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(map_values(&quot;data&quot;).alias(&quot;values&quot;)).show()</span>
<span class="sd"> +------+</span>
<span class="sd"> |values|</span>
<span class="sd"> +------+</span>
<span class="sd"> |[a, b]|</span>
<span class="sd"> +------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;map_values&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="map_entries"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.map_entries.html#pyspark.sql.functions.map_entries">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">map_entries</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: Returns an unordered array of all entries in the given map.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> an array of key value pairs as a struct type</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import map_entries</span>
<span class="sd"> &gt;&gt;&gt; df = spark.sql(&quot;SELECT map(1, &#39;a&#39;, 2, &#39;b&#39;) as data&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = df.select(map_entries(&quot;data&quot;).alias(&quot;entries&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.show()</span>
<span class="sd"> +----------------+</span>
<span class="sd"> | entries|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> |[{1, a}, {2, b}]|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> &gt;&gt;&gt; df.printSchema()</span>
<span class="sd"> root</span>
<span class="sd"> |-- entries: array (nullable = false)</span>
<span class="sd"> | |-- element: struct (containsNull = false)</span>
<span class="sd"> | | |-- key: integer (nullable = false)</span>
<span class="sd"> | | |-- value: string (nullable = false)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;map_entries&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="map_from_entries"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.map_from_entries.html#pyspark.sql.functions.map_from_entries">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">map_from_entries</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: Converts an array of entries (key value struct types) to a map</span>
<span class="sd"> of values.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a map created from the given array of entries.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import map_from_entries</span>
<span class="sd"> &gt;&gt;&gt; df = spark.sql(&quot;SELECT array(struct(1, &#39;a&#39;), struct(2, &#39;b&#39;)) as data&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(map_from_entries(&quot;data&quot;).alias(&quot;map&quot;)).show()</span>
<span class="sd"> +----------------+</span>
<span class="sd"> | map|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> |{1 -&gt; a, 2 -&gt; b}|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;map_from_entries&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="array_repeat"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_repeat.html#pyspark.sql.functions.array_repeat">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">array_repeat</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">count</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="nb">int</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: creates an array containing a column repeated count times.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column name or column that contains the element to be repeated</span>
<span class="sd"> count : :class:`~pyspark.sql.Column` or str or int</span>
<span class="sd"> column name, column, or int containing the number of times to repeat the first argument</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> an array of repeated elements.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;ab&#39;,)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_repeat(df.data, 3).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=[&#39;ab&#39;, &#39;ab&#39;, &#39;ab&#39;])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">count</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">count</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">count</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">count</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;array_repeat&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">count</span><span class="p">)</span></div>
<div class="viewcode-block" id="arrays_zip"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.arrays_zip.html#pyspark.sql.functions.arrays_zip">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">arrays_zip</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: Returns a merged array of structs in which the N-th struct contains all</span>
<span class="sd"> N-th values of input arrays. If one of the arrays is shorter than others then</span>
<span class="sd"> resulting struct type value will be a `null` for missing elements.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> columns of arrays to be merged.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> merged array of entries.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import arrays_zip</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([1, 2, 3], [2, 4, 6], [3, 6])], [&#39;vals1&#39;, &#39;vals2&#39;, &#39;vals3&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df = df.select(arrays_zip(df.vals1, df.vals2, df.vals3).alias(&#39;zipped&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df.show(truncate=False)</span>
<span class="sd"> +------------------------------------+</span>
<span class="sd"> |zipped |</span>
<span class="sd"> +------------------------------------+</span>
<span class="sd"> |[{1, 2, 3}, {2, 4, 6}, {3, 6, NULL}]|</span>
<span class="sd"> +------------------------------------+</span>
<span class="sd"> &gt;&gt;&gt; df.printSchema()</span>
<span class="sd"> root</span>
<span class="sd"> |-- zipped: array (nullable = true)</span>
<span class="sd"> | |-- element: struct (containsNull = false)</span>
<span class="sd"> | | |-- vals1: long (nullable = true)</span>
<span class="sd"> | | |-- vals2: long (nullable = true)</span>
<span class="sd"> | | |-- vals3: long (nullable = true)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">&quot;arrays_zip&quot;</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">map_concat</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">map_concat</span><span class="p">(</span><span class="n">__cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">&quot;ColumnOrName_&quot;</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">&quot;ColumnOrName_&quot;</span><span class="p">,</span> <span class="o">...</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="map_concat"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.map_concat.html#pyspark.sql.functions.map_concat">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">map_concat</span><span class="p">(</span>
<span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">&quot;ColumnOrName_&quot;</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">&quot;ColumnOrName_&quot;</span><span class="p">,</span> <span class="o">...</span><span class="p">]]]</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the union of all the given maps.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column names or :class:`~pyspark.sql.Column`\\s</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a map of merged entries from other maps.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import map_concat</span>
<span class="sd"> &gt;&gt;&gt; df = spark.sql(&quot;SELECT map(1, &#39;a&#39;, 2, &#39;b&#39;) as map1, map(3, &#39;c&#39;) as map2&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(map_concat(&quot;map1&quot;, &quot;map2&quot;).alias(&quot;map3&quot;)).show(truncate=False)</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> |map3 |</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> |{1 -&gt; a, 2 -&gt; b, 3 -&gt; c}|</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">set</span><span class="p">)):</span>
<span class="n">cols</span> <span class="o">=</span> <span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="c1"># type: ignore[assignment]</span>
<span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">&quot;map_concat&quot;</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span> <span class="c1"># type: ignore[arg-type]</span></div>
<div class="viewcode-block" id="sequence"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sequence.html#pyspark.sql.functions.sequence">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">sequence</span><span class="p">(</span>
<span class="n">start</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">stop</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">step</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Generate a sequence of integers from `start` to `stop`, incrementing by `step`.</span>
<span class="sd"> If `step` is not set, incrementing by 1 if `start` is less than or equal to `stop`,</span>
<span class="sd"> otherwise -1.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> start : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> starting value (inclusive)</span>
<span class="sd"> stop : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> last values (inclusive)</span>
<span class="sd"> step : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> value to add to current to get next element (default is 1)</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> an array of sequence values</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(-2, 2)], (&#39;C1&#39;, &#39;C2&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df1.select(sequence(&#39;C1&#39;, &#39;C2&#39;).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=[-2, -1, 0, 1, 2])]</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(4, -4, -2)], (&#39;C1&#39;, &#39;C2&#39;, &#39;C3&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df2.select(sequence(&#39;C1&#39;, &#39;C2&#39;, &#39;C3&#39;).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=[4, 2, 0, -2, -4])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">step</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;sequence&quot;</span><span class="p">,</span> <span class="n">start</span><span class="p">,</span> <span class="n">stop</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;sequence&quot;</span><span class="p">,</span> <span class="n">start</span><span class="p">,</span> <span class="n">stop</span><span class="p">,</span> <span class="n">step</span><span class="p">)</span></div>
<div class="viewcode-block" id="from_csv"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.from_csv.html#pyspark.sql.functions.from_csv">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">from_csv</span><span class="p">(</span>
<span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span>
<span class="n">options</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Parses a column containing a CSV string to a row with the specified schema.</span>
<span class="sd"> Returns `null`, in the case of an unparseable string.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a column or column name in CSV format</span>
<span class="sd"> schema :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a column, or Python string literal with schema in DDL format, to use when parsing the CSV column.</span>
<span class="sd"> options : dict, optional</span>
<span class="sd"> options to control parsing. accepts the same options as the CSV datasource.</span>
<span class="sd"> See `Data Source Option &lt;https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option&gt;`_</span>
<span class="sd"> for the version you use.</span>
<span class="sd"> .. # noqa</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a column of parsed CSV values</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; data = [(&quot;1,2,3&quot;,)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;value&quot;,))</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_csv(df.value, &quot;a INT, b INT, c INT&quot;).alias(&quot;csv&quot;)).collect()</span>
<span class="sd"> [Row(csv=Row(a=1, b=2, c=3))]</span>
<span class="sd"> &gt;&gt;&gt; value = data[0][0]</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_csv(df.value, schema_of_csv(value)).alias(&quot;csv&quot;)).collect()</span>
<span class="sd"> [Row(csv=Row(_c0=1, _c1=2, _c2=3))]</span>
<span class="sd"> &gt;&gt;&gt; data = [(&quot; abc&quot;,)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;value&quot;,))</span>
<span class="sd"> &gt;&gt;&gt; options = {&#39;ignoreLeadingWhiteSpace&#39;: True}</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_csv(df.value, &quot;s string&quot;, options).alias(&quot;csv&quot;)).collect()</span>
<span class="sd"> [Row(csv=Row(s=&#39;abc&#39;))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">get_active_spark_context</span><span class="p">()</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span>
<span class="n">error_class</span><span class="o">=</span><span class="s2">&quot;NOT_COLUMN_OR_STR&quot;</span><span class="p">,</span>
<span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;arg_name&quot;</span><span class="p">:</span> <span class="s2">&quot;schema&quot;</span><span class="p">,</span> <span class="s2">&quot;arg_type&quot;</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;from_csv&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">schema</span><span class="p">,</span> <span class="n">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="p">))</span></div>
<span class="k">def</span> <span class="nf">_unresolved_named_lambda_variable</span><span class="p">(</span><span class="o">*</span><span class="n">name_parts</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Create `o.a.s.sql.expressions.UnresolvedNamedLambdaVariable`,</span>
<span class="sd"> convert it to o.s.sql.Column and wrap in Python `Column`</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> name_parts : str</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span>
<span class="n">name_parts_seq</span> <span class="o">=</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">name_parts</span><span class="p">)</span>
<span class="n">expressions</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">JVMView</span><span class="p">,</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="p">)</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">catalyst</span><span class="o">.</span><span class="n">expressions</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span>
<span class="n">cast</span><span class="p">(</span><span class="n">JVMView</span><span class="p">,</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="p">)</span><span class="o">.</span><span class="n">Column</span><span class="p">(</span><span class="n">expressions</span><span class="o">.</span><span class="n">UnresolvedNamedLambdaVariable</span><span class="p">(</span><span class="n">name_parts_seq</span><span class="p">))</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">_get_lambda_parameters</span><span class="p">(</span><span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">ValuesView</span><span class="p">[</span><span class="n">inspect</span><span class="o">.</span><span class="n">Parameter</span><span class="p">]:</span>
<span class="n">signature</span> <span class="o">=</span> <span class="n">inspect</span><span class="o">.</span><span class="n">signature</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
<span class="n">parameters</span> <span class="o">=</span> <span class="n">signature</span><span class="o">.</span><span class="n">parameters</span><span class="o">.</span><span class="n">values</span><span class="p">()</span>
<span class="c1"># We should exclude functions that use</span>
<span class="c1"># variable args and keyword argnames</span>
<span class="c1"># as well as keyword only args</span>
<span class="n">supported_parameter_types</span> <span class="o">=</span> <span class="p">{</span>
<span class="n">inspect</span><span class="o">.</span><span class="n">Parameter</span><span class="o">.</span><span class="n">POSITIONAL_OR_KEYWORD</span><span class="p">,</span>
<span class="n">inspect</span><span class="o">.</span><span class="n">Parameter</span><span class="o">.</span><span class="n">POSITIONAL_ONLY</span><span class="p">,</span>
<span class="p">}</span>
<span class="c1"># Validate that</span>
<span class="c1"># function arity is between 1 and 3</span>
<span class="k">if</span> <span class="ow">not</span> <span class="p">(</span><span class="mi">1</span> <span class="o">&lt;=</span> <span class="nb">len</span><span class="p">(</span><span class="n">parameters</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="mi">3</span><span class="p">):</span>
<span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span>
<span class="n">error_class</span><span class="o">=</span><span class="s2">&quot;WRONG_NUM_ARGS_FOR_HIGHER_ORDER_FUNCTION&quot;</span><span class="p">,</span>
<span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;func_name&quot;</span><span class="p">:</span> <span class="n">f</span><span class="o">.</span><span class="vm">__name__</span><span class="p">,</span> <span class="s2">&quot;num_args&quot;</span><span class="p">:</span> <span class="nb">str</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">parameters</span><span class="p">))},</span>
<span class="p">)</span>
<span class="c1"># and all arguments can be used as positional</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span><span class="n">p</span><span class="o">.</span><span class="n">kind</span> <span class="ow">in</span> <span class="n">supported_parameter_types</span> <span class="k">for</span> <span class="n">p</span> <span class="ow">in</span> <span class="n">parameters</span><span class="p">):</span>
<span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span>
<span class="n">error_class</span><span class="o">=</span><span class="s2">&quot;UNSUPPORTED_PARAM_TYPE_FOR_HIGHER_ORDER_FUNCTION&quot;</span><span class="p">,</span>
<span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;func_name&quot;</span><span class="p">:</span> <span class="n">f</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">parameters</span>
<span class="k">def</span> <span class="nf">_create_lambda</span><span class="p">(</span><span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Callable</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Create `o.a.s.sql.expressions.LambdaFunction` corresponding</span>
<span class="sd"> to transformation described by f</span>
<span class="sd"> :param f: A Python of one of the following forms:</span>
<span class="sd"> - (Column) -&gt; Column: ...</span>
<span class="sd"> - (Column, Column) -&gt; Column: ...</span>
<span class="sd"> - (Column, Column, Column) -&gt; Column: ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">parameters</span> <span class="o">=</span> <span class="n">_get_lambda_parameters</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span>
<span class="n">expressions</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">JVMView</span><span class="p">,</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="p">)</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">catalyst</span><span class="o">.</span><span class="n">expressions</span>
<span class="n">argnames</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;x&quot;</span><span class="p">,</span> <span class="s2">&quot;y&quot;</span><span class="p">,</span> <span class="s2">&quot;z&quot;</span><span class="p">]</span>
<span class="n">args</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">_unresolved_named_lambda_variable</span><span class="p">(</span>
<span class="n">expressions</span><span class="o">.</span><span class="n">UnresolvedNamedLambdaVariable</span><span class="o">.</span><span class="n">freshVarName</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">argnames</span><span class="p">[:</span> <span class="nb">len</span><span class="p">(</span><span class="n">parameters</span><span class="p">)]</span>
<span class="p">]</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">f</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">result</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span>
<span class="n">error_class</span><span class="o">=</span><span class="s2">&quot;HIGHER_ORDER_FUNCTION_SHOULD_RETURN_COLUMN&quot;</span><span class="p">,</span>
<span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;func_name&quot;</span><span class="p">:</span> <span class="n">f</span><span class="o">.</span><span class="vm">__name__</span><span class="p">,</span> <span class="s2">&quot;return_type&quot;</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">result</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span>
<span class="p">)</span>
<span class="n">jexpr</span> <span class="o">=</span> <span class="n">result</span><span class="o">.</span><span class="n">_jc</span><span class="o">.</span><span class="n">expr</span><span class="p">()</span>
<span class="n">jargs</span> <span class="o">=</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="p">[</span><span class="n">arg</span><span class="o">.</span><span class="n">_jc</span><span class="o">.</span><span class="n">expr</span><span class="p">()</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span><span class="p">])</span>
<span class="k">return</span> <span class="n">expressions</span><span class="o">.</span><span class="n">LambdaFunction</span><span class="p">(</span><span class="n">jexpr</span><span class="p">,</span> <span class="n">jargs</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_invoke_higher_order_function</span><span class="p">(</span>
<span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">cols</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">],</span>
<span class="n">funs</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Callable</span><span class="p">],</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Invokes expression identified by name,</span>
<span class="sd"> (relative to ```org.apache.spark.sql.catalyst.expressions``)</span>
<span class="sd"> and wraps the result with Column (first Scala one, then Python).</span>
<span class="sd"> :param name: Name of the expression</span>
<span class="sd"> :param cols: a list of columns</span>
<span class="sd"> :param funs: a list of (*Column) -&gt; Column functions.</span>
<span class="sd"> :return: a Column</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span>
<span class="n">expressions</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">JVMView</span><span class="p">,</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="p">)</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">catalyst</span><span class="o">.</span><span class="n">expressions</span>
<span class="n">expr</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">expressions</span><span class="p">,</span> <span class="n">name</span><span class="p">)</span>
<span class="n">jcols</span> <span class="o">=</span> <span class="p">[</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)</span><span class="o">.</span><span class="n">expr</span><span class="p">()</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">cols</span><span class="p">]</span>
<span class="n">jfuns</span> <span class="o">=</span> <span class="p">[</span><span class="n">_create_lambda</span><span class="p">(</span><span class="n">f</span><span class="p">)</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">funs</span><span class="p">]</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">JVMView</span><span class="p">,</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="p">)</span><span class="o">.</span><span class="n">Column</span><span class="p">(</span><span class="n">expr</span><span class="p">(</span><span class="o">*</span><span class="n">jcols</span> <span class="o">+</span> <span class="n">jfuns</span><span class="p">)))</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="transform"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.transform.html#pyspark.sql.functions.transform">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">transform</span><span class="p">(</span>
<span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">f</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">]],</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns an array of elements after applying a transformation to each element in the input array.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> f : function</span>
<span class="sd"> a function that is applied to each element of the input array.</span>
<span class="sd"> Can take one of the following forms:</span>
<span class="sd"> - Unary ``(x: Column) -&gt; Column: ...``</span>
<span class="sd"> - Binary ``(x: Column, i: Column) -&gt; Column...``, where the second argument is</span>
<span class="sd"> a 0-based index of the element.</span>
<span class="sd"> and can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a new array of transformed elements.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, [1, 2, 3, 4])], (&quot;key&quot;, &quot;values&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(transform(&quot;values&quot;, lambda x: x * 2).alias(&quot;doubled&quot;)).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> | doubled|</span>
<span class="sd"> +------------+</span>
<span class="sd"> |[2, 4, 6, 8]|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &gt;&gt;&gt; def alternate(x, i):</span>
<span class="sd"> ... return when(i % 2 == 0, x).otherwise(-x)</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; df.select(transform(&quot;values&quot;, alternate).alias(&quot;alternated&quot;)).show()</span>
<span class="sd"> +--------------+</span>
<span class="sd"> | alternated|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> |[1, -2, 3, -4]|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;ArrayTransform&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div>
<div class="viewcode-block" id="exists"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.exists.html#pyspark.sql.functions.exists">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">exists</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns whether a predicate holds for one or more elements in the array.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> f : function</span>
<span class="sd"> ``(x: Column) -&gt; Column: ...`` returning the Boolean expression.</span>
<span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> True if &quot;any&quot; element of an array evaluates to True when passed as an argument to</span>
<span class="sd"> given function and False otherwise.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, [1, 2, 3, 4]), (2, [3, -1, 0])],(&quot;key&quot;, &quot;values&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(exists(&quot;values&quot;, lambda x: x &lt; 0).alias(&quot;any_negative&quot;)).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |any_negative|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | false|</span>
<span class="sd"> | true|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;ArrayExists&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div>
<div class="viewcode-block" id="forall"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.forall.html#pyspark.sql.functions.forall">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">forall</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns whether a predicate holds for every element in the array.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> f : function</span>
<span class="sd"> ``(x: Column) -&gt; Column: ...`` returning the Boolean expression.</span>
<span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> True if &quot;all&quot; elements of an array evaluates to True when passed as an argument to</span>
<span class="sd"> given function and False otherwise.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(1, [&quot;bar&quot;]), (2, [&quot;foo&quot;, &quot;bar&quot;]), (3, [&quot;foobar&quot;, &quot;foo&quot;])],</span>
<span class="sd"> ... (&quot;key&quot;, &quot;values&quot;)</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(forall(&quot;values&quot;, lambda x: x.rlike(&quot;foo&quot;)).alias(&quot;all_foo&quot;)).show()</span>
<span class="sd"> +-------+</span>
<span class="sd"> |all_foo|</span>
<span class="sd"> +-------+</span>
<span class="sd"> | false|</span>
<span class="sd"> | false|</span>
<span class="sd"> | true|</span>
<span class="sd"> +-------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;ArrayForAll&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">filter</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">filter</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="filter"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.filter.html#pyspark.sql.functions.filter">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">filter</span><span class="p">(</span>
<span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">f</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">]],</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns an array of elements for which a predicate holds in a given array.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> f : function</span>
<span class="sd"> A function that returns the Boolean expression.</span>
<span class="sd"> Can take one of the following forms:</span>
<span class="sd"> - Unary ``(x: Column) -&gt; Column: ...``</span>
<span class="sd"> - Binary ``(x: Column, i: Column) -&gt; Column...``, where the second argument is</span>
<span class="sd"> a 0-based index of the element.</span>
<span class="sd"> and can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> filtered array of elements where given function evaluated to True</span>
<span class="sd"> when passed as an argument.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(1, [&quot;2018-09-20&quot;, &quot;2019-02-03&quot;, &quot;2019-07-01&quot;, &quot;2020-06-01&quot;])],</span>
<span class="sd"> ... (&quot;key&quot;, &quot;values&quot;)</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; def after_second_quarter(x):</span>
<span class="sd"> ... return month(to_date(x)) &gt; 6</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... filter(&quot;values&quot;, after_second_quarter).alias(&quot;after_second_quarter&quot;)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> |after_second_quarter |</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> |[2018-09-20, 2019-07-01]|</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;ArrayFilter&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div>
<div class="viewcode-block" id="aggregate"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.aggregate.html#pyspark.sql.functions.aggregate">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">aggregate</span><span class="p">(</span>
<span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">initialValue</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">merge</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span>
<span class="n">finish</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Applies a binary operator to an initial state and all elements in the array,</span>
<span class="sd"> and reduces this to a single state. The final state is converted into the final result</span>
<span class="sd"> by applying a finish function.</span>
<span class="sd"> Both functions can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> initialValue : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> initial value. Name of column or expression</span>
<span class="sd"> merge : function</span>
<span class="sd"> a binary function ``(acc: Column, x: Column) -&gt; Column...`` returning expression</span>
<span class="sd"> of the same type as ``zero``</span>
<span class="sd"> finish : function</span>
<span class="sd"> an optional unary function ``(x: Column) -&gt; Column: ...``</span>
<span class="sd"> used to convert accumulated value.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> final value after aggregate function is applied.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, [20.0, 4.0, 2.0, 6.0, 10.0])], (&quot;id&quot;, &quot;values&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(aggregate(&quot;values&quot;, lit(0.0), lambda acc, x: acc + x).alias(&quot;sum&quot;)).show()</span>
<span class="sd"> +----+</span>
<span class="sd"> | sum|</span>
<span class="sd"> +----+</span>
<span class="sd"> |42.0|</span>
<span class="sd"> +----+</span>
<span class="sd"> &gt;&gt;&gt; def merge(acc, x):</span>
<span class="sd"> ... count = acc.count + 1</span>
<span class="sd"> ... sum = acc.sum + x</span>
<span class="sd"> ... return struct(count.alias(&quot;count&quot;), sum.alias(&quot;sum&quot;))</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... aggregate(</span>
<span class="sd"> ... &quot;values&quot;,</span>
<span class="sd"> ... struct(lit(0).alias(&quot;count&quot;), lit(0.0).alias(&quot;sum&quot;)),</span>
<span class="sd"> ... merge,</span>
<span class="sd"> ... lambda acc: acc.sum / acc.count,</span>
<span class="sd"> ... ).alias(&quot;mean&quot;)</span>
<span class="sd"> ... ).show()</span>
<span class="sd"> +----+</span>
<span class="sd"> |mean|</span>
<span class="sd"> +----+</span>
<span class="sd"> | 8.4|</span>
<span class="sd"> +----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">finish</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;ArrayAggregate&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">,</span> <span class="n">initialValue</span><span class="p">],</span> <span class="p">[</span><span class="n">merge</span><span class="p">,</span> <span class="n">finish</span><span class="p">])</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;ArrayAggregate&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">,</span> <span class="n">initialValue</span><span class="p">],</span> <span class="p">[</span><span class="n">merge</span><span class="p">])</span></div>
<div class="viewcode-block" id="reduce"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.reduce.html#pyspark.sql.functions.reduce">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">reduce</span><span class="p">(</span>
<span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">initialValue</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">merge</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span>
<span class="n">finish</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Applies a binary operator to an initial state and all elements in the array,</span>
<span class="sd"> and reduces this to a single state. The final state is converted into the final result</span>
<span class="sd"> by applying a finish function.</span>
<span class="sd"> Both functions can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> initialValue : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> initial value. Name of column or expression</span>
<span class="sd"> merge : function</span>
<span class="sd"> a binary function ``(acc: Column, x: Column) -&gt; Column...`` returning expression</span>
<span class="sd"> of the same type as ``zero``</span>
<span class="sd"> finish : function</span>
<span class="sd"> an optional unary function ``(x: Column) -&gt; Column: ...``</span>
<span class="sd"> used to convert accumulated value.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> final value after aggregate function is applied.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, [20.0, 4.0, 2.0, 6.0, 10.0])], (&quot;id&quot;, &quot;values&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(reduce(&quot;values&quot;, lit(0.0), lambda acc, x: acc + x).alias(&quot;sum&quot;)).show()</span>
<span class="sd"> +----+</span>
<span class="sd"> | sum|</span>
<span class="sd"> +----+</span>
<span class="sd"> |42.0|</span>
<span class="sd"> +----+</span>
<span class="sd"> &gt;&gt;&gt; def merge(acc, x):</span>
<span class="sd"> ... count = acc.count + 1</span>
<span class="sd"> ... sum = acc.sum + x</span>
<span class="sd"> ... return struct(count.alias(&quot;count&quot;), sum.alias(&quot;sum&quot;))</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... reduce(</span>
<span class="sd"> ... &quot;values&quot;,</span>
<span class="sd"> ... struct(lit(0).alias(&quot;count&quot;), lit(0.0).alias(&quot;sum&quot;)),</span>
<span class="sd"> ... merge,</span>
<span class="sd"> ... lambda acc: acc.sum / acc.count,</span>
<span class="sd"> ... ).alias(&quot;mean&quot;)</span>
<span class="sd"> ... ).show()</span>
<span class="sd"> +----+</span>
<span class="sd"> |mean|</span>
<span class="sd"> +----+</span>
<span class="sd"> | 8.4|</span>
<span class="sd"> +----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">finish</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;ArrayAggregate&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">,</span> <span class="n">initialValue</span><span class="p">],</span> <span class="p">[</span><span class="n">merge</span><span class="p">,</span> <span class="n">finish</span><span class="p">])</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;ArrayAggregate&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">,</span> <span class="n">initialValue</span><span class="p">],</span> <span class="p">[</span><span class="n">merge</span><span class="p">])</span></div>
<div class="viewcode-block" id="zip_with"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.zip_with.html#pyspark.sql.functions.zip_with">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">zip_with</span><span class="p">(</span>
<span class="n">left</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">right</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Merge two given arrays, element-wise, into a single array using a function.</span>
<span class="sd"> If one array is shorter, nulls are appended at the end to match the length of the longer</span>
<span class="sd"> array, before applying the function.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> left : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of the first column or expression</span>
<span class="sd"> right : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of the second column or expression</span>
<span class="sd"> f : function</span>
<span class="sd"> a binary function ``(x1: Column, x2: Column) -&gt; Column...``</span>
<span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> array of calculated values derived by applying given function to each pair of arguments.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, [1, 3, 5, 8], [0, 2, 4, 6])], (&quot;id&quot;, &quot;xs&quot;, &quot;ys&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(zip_with(&quot;xs&quot;, &quot;ys&quot;, lambda x, y: x ** y).alias(&quot;powers&quot;)).show(truncate=False)</span>
<span class="sd"> +---------------------------+</span>
<span class="sd"> |powers |</span>
<span class="sd"> +---------------------------+</span>
<span class="sd"> |[1.0, 9.0, 625.0, 262144.0]|</span>
<span class="sd"> +---------------------------+</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, [&quot;foo&quot;, &quot;bar&quot;], [1, 2, 3])], (&quot;id&quot;, &quot;xs&quot;, &quot;ys&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(zip_with(&quot;xs&quot;, &quot;ys&quot;, lambda x, y: concat_ws(&quot;_&quot;, x, y)).alias(&quot;xs_ys&quot;)).show()</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> | xs_ys|</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> |[foo_1, bar_2, 3]|</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;ZipWith&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div>
<div class="viewcode-block" id="transform_keys"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.transform_keys.html#pyspark.sql.functions.transform_keys">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">transform_keys</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Applies a function to every key-value pair in a map and returns</span>
<span class="sd"> a map with the results of those applications as the new keys for the pairs.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> f : function</span>
<span class="sd"> a binary function ``(k: Column, v: Column) -&gt; Column...``</span>
<span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a new map of enties where new keys were calculated by applying given function to</span>
<span class="sd"> each key value argument.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, {&quot;foo&quot;: -2.0, &quot;bar&quot;: 2.0})], (&quot;id&quot;, &quot;data&quot;))</span>
<span class="sd"> &gt;&gt;&gt; row = df.select(transform_keys(</span>
<span class="sd"> ... &quot;data&quot;, lambda k, _: upper(k)).alias(&quot;data_upper&quot;)</span>
<span class="sd"> ... ).head()</span>
<span class="sd"> &gt;&gt;&gt; sorted(row[&quot;data_upper&quot;].items())</span>
<span class="sd"> [(&#39;BAR&#39;, 2.0), (&#39;FOO&#39;, -2.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;TransformKeys&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div>
<div class="viewcode-block" id="transform_values"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.transform_values.html#pyspark.sql.functions.transform_values">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">transform_values</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Applies a function to every key-value pair in a map and returns</span>
<span class="sd"> a map with the results of those applications as the new values for the pairs.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> f : function</span>
<span class="sd"> a binary function ``(k: Column, v: Column) -&gt; Column...``</span>
<span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> a new map of enties where new values were calculated by applying given function to</span>
<span class="sd"> each key value argument.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, {&quot;IT&quot;: 10.0, &quot;SALES&quot;: 2.0, &quot;OPS&quot;: 24.0})], (&quot;id&quot;, &quot;data&quot;))</span>
<span class="sd"> &gt;&gt;&gt; row = df.select(transform_values(</span>
<span class="sd"> ... &quot;data&quot;, lambda k, v: when(k.isin(&quot;IT&quot;, &quot;OPS&quot;), v + 10.0).otherwise(v)</span>
<span class="sd"> ... ).alias(&quot;new_data&quot;)).head()</span>
<span class="sd"> &gt;&gt;&gt; sorted(row[&quot;new_data&quot;].items())</span>
<span class="sd"> [(&#39;IT&#39;, 20.0), (&#39;OPS&#39;, 34.0), (&#39;SALES&#39;, 2.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;TransformValues&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div>
<div class="viewcode-block" id="map_filter"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.map_filter.html#pyspark.sql.functions.map_filter">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">map_filter</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a map whose key-value pairs satisfy a predicate.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> f : function</span>
<span class="sd"> a binary function ``(k: Column, v: Column) -&gt; Column...``</span>
<span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> filtered map.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, {&quot;foo&quot;: 42.0, &quot;bar&quot;: 1.0, &quot;baz&quot;: 32.0})], (&quot;id&quot;, &quot;data&quot;))</span>
<span class="sd"> &gt;&gt;&gt; row = df.select(map_filter(</span>
<span class="sd"> ... &quot;data&quot;, lambda _, v: v &gt; 30.0).alias(&quot;data_filtered&quot;)</span>
<span class="sd"> ... ).head()</span>
<span class="sd"> &gt;&gt;&gt; sorted(row[&quot;data_filtered&quot;].items())</span>
<span class="sd"> [(&#39;baz&#39;, 32.0), (&#39;foo&#39;, 42.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;MapFilter&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div>
<div class="viewcode-block" id="map_zip_with"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.map_zip_with.html#pyspark.sql.functions.map_zip_with">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">map_zip_with</span><span class="p">(</span>
<span class="n">col1</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">col2</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Merge two given maps, key-wise into a single map using a function.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of the first column or expression</span>
<span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of the second column or expression</span>
<span class="sd"> f : function</span>
<span class="sd"> a ternary function ``(k: Column, v1: Column, v2: Column) -&gt; Column...``</span>
<span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> zipped map where entries are calculated by applying given function to each</span>
<span class="sd"> pair of arguments.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (1, {&quot;IT&quot;: 24.0, &quot;SALES&quot;: 12.00}, {&quot;IT&quot;: 2.0, &quot;SALES&quot;: 1.4})],</span>
<span class="sd"> ... (&quot;id&quot;, &quot;base&quot;, &quot;ratio&quot;)</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; row = df.select(map_zip_with(</span>
<span class="sd"> ... &quot;base&quot;, &quot;ratio&quot;, lambda k, v1, v2: round(v1 * v2, 2)).alias(&quot;updated_data&quot;)</span>
<span class="sd"> ... ).head()</span>
<span class="sd"> &gt;&gt;&gt; sorted(row[&quot;updated_data&quot;].items())</span>
<span class="sd"> [(&#39;IT&#39;, 48.0), (&#39;SALES&#39;, 16.8)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;MapZipWith&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div>
<div class="viewcode-block" id="str_to_map"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.str_to_map.html#pyspark.sql.functions.str_to_map">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">str_to_map</span><span class="p">(</span>
<span class="n">text</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">pairDelim</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">keyValueDelim</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Creates a map after splitting the text into key/value pairs using delimiters.</span>
<span class="sd"> Both `pairDelim` and `keyValueDelim` are treated as regular expressions.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> text : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Input column or strings.</span>
<span class="sd"> pairDelim : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> delimiter to use to split pair.</span>
<span class="sd"> keyValueDelim : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> delimiter to use to split key/value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;a:1,b:2,c:3&quot;,)], [&quot;e&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(str_to_map(df.e, lit(&quot;,&quot;), lit(&quot;:&quot;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r={&#39;a&#39;: &#39;1&#39;, &#39;b&#39;: &#39;2&#39;, &#39;c&#39;: &#39;3&#39;})]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;a:1,b:2,c:3&quot;,)], [&quot;e&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(str_to_map(df.e, lit(&quot;,&quot;)).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r={&#39;a&#39;: &#39;1&#39;, &#39;b&#39;: &#39;2&#39;, &#39;c&#39;: &#39;3&#39;})]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;a:1,b:2,c:3&quot;,)], [&quot;e&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(str_to_map(df.e).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r={&#39;a&#39;: &#39;1&#39;, &#39;b&#39;: &#39;2&#39;, &#39;c&#39;: &#39;3&#39;})]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">pairDelim</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">pairDelim</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">&quot;,&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">keyValueDelim</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">keyValueDelim</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">&quot;:&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;str_to_map&quot;</span><span class="p">,</span> <span class="n">text</span><span class="p">,</span> <span class="n">pairDelim</span><span class="p">,</span> <span class="n">keyValueDelim</span><span class="p">)</span></div>
<span class="c1"># ---------------------- Partition transform functions --------------------------------</span>
<div class="viewcode-block" id="years"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.years.html#pyspark.sql.functions.years">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">years</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Partition transform function: A transform for timestamps and dates</span>
<span class="sd"> to partition data into years.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target date or timestamp column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> data partitioned by years.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.writeTo(&quot;catalog.db.table&quot;).partitionedBy( # doctest: +SKIP</span>
<span class="sd"> ... years(&quot;ts&quot;)</span>
<span class="sd"> ... ).createOrReplace()</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This function can be used only in combination with</span>
<span class="sd"> :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`</span>
<span class="sd"> method of the `DataFrameWriterV2`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;years&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="months"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.months.html#pyspark.sql.functions.months">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">months</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Partition transform function: A transform for timestamps and dates</span>
<span class="sd"> to partition data into months.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target date or timestamp column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> data partitioned by months.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.writeTo(&quot;catalog.db.table&quot;).partitionedBy(</span>
<span class="sd"> ... months(&quot;ts&quot;)</span>
<span class="sd"> ... ).createOrReplace() # doctest: +SKIP</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This function can be used only in combination with</span>
<span class="sd"> :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`</span>
<span class="sd"> method of the `DataFrameWriterV2`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;months&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="days"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.days.html#pyspark.sql.functions.days">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">days</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Partition transform function: A transform for timestamps and dates</span>
<span class="sd"> to partition data into days.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target date or timestamp column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> data partitioned by days.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.writeTo(&quot;catalog.db.table&quot;).partitionedBy( # doctest: +SKIP</span>
<span class="sd"> ... days(&quot;ts&quot;)</span>
<span class="sd"> ... ).createOrReplace()</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This function can be used only in combination with</span>
<span class="sd"> :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`</span>
<span class="sd"> method of the `DataFrameWriterV2`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;days&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="hours"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.hours.html#pyspark.sql.functions.hours">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">hours</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Partition transform function: A transform for timestamps</span>
<span class="sd"> to partition data into hours.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target date or timestamp column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> data partitioned by hours.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.writeTo(&quot;catalog.db.table&quot;).partitionedBy( # doctest: +SKIP</span>
<span class="sd"> ... hours(&quot;ts&quot;)</span>
<span class="sd"> ... ).createOrReplace()</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This function can be used only in combination with</span>
<span class="sd"> :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`</span>
<span class="sd"> method of the `DataFrameWriterV2`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;hours&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="convert_timezone"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.convert_timezone.html#pyspark.sql.functions.convert_timezone">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">convert_timezone</span><span class="p">(</span>
<span class="n">sourceTz</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Column</span><span class="p">],</span> <span class="n">targetTz</span><span class="p">:</span> <span class="n">Column</span><span class="p">,</span> <span class="n">sourceTs</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts the timestamp without time zone `sourceTs`</span>
<span class="sd"> from the `sourceTz` time zone to `targetTz`.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> sourceTz : :class:`~pyspark.sql.Column`</span>
<span class="sd"> the time zone for the input timestamp. If it is missed,</span>
<span class="sd"> the current session time zone is used as the source time zone.</span>
<span class="sd"> targetTz : :class:`~pyspark.sql.Column`</span>
<span class="sd"> the time zone to which the input timestamp should be converted.</span>
<span class="sd"> sourceTs : :class:`~pyspark.sql.Column`</span>
<span class="sd"> a timestamp without time zone.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> timestamp for converted time zone.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(convert_timezone( # doctest: +SKIP</span>
<span class="sd"> ... None, lit(&#39;Asia/Hong_Kong&#39;), &#39;dt&#39;).alias(&#39;ts&#39;)</span>
<span class="sd"> ... ).show()</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> | ts|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> |2015-04-08 00:00:00|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(convert_timezone(</span>
<span class="sd"> ... lit(&#39;America/Los_Angeles&#39;), lit(&#39;Asia/Hong_Kong&#39;), &#39;dt&#39;).alias(&#39;ts&#39;)</span>
<span class="sd"> ... ).show()</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> | ts|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> |2015-04-08 15:00:00|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">sourceTz</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;convert_timezone&quot;</span><span class="p">,</span> <span class="n">targetTz</span><span class="p">,</span> <span class="n">sourceTs</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;convert_timezone&quot;</span><span class="p">,</span> <span class="n">sourceTz</span><span class="p">,</span> <span class="n">targetTz</span><span class="p">,</span> <span class="n">sourceTs</span><span class="p">)</span></div>
<div class="viewcode-block" id="make_dt_interval"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.make_dt_interval.html#pyspark.sql.functions.make_dt_interval">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">make_dt_interval</span><span class="p">(</span>
<span class="n">days</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">hours</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">mins</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">secs</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Make DayTimeIntervalType duration from days, hours, mins and secs.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> days : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the number of days, positive or negative</span>
<span class="sd"> hours : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the number of hours, positive or negative</span>
<span class="sd"> mins : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the number of minutes, positive or negative</span>
<span class="sd"> secs : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the number of seconds with the fractional part in microsecond precision.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[1, 12, 30, 01.001001]],</span>
<span class="sd"> ... [&quot;day&quot;, &quot;hour&quot;, &quot;min&quot;, &quot;sec&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(make_dt_interval(</span>
<span class="sd"> ... df.day, df.hour, df.min, df.sec).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +------------------------------------------+</span>
<span class="sd"> |r |</span>
<span class="sd"> +------------------------------------------+</span>
<span class="sd"> |INTERVAL &#39;1 12:30:01.001001&#39; DAY TO SECOND|</span>
<span class="sd"> +------------------------------------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(make_dt_interval(</span>
<span class="sd"> ... df.day, df.hour, df.min).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> |r |</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> |INTERVAL &#39;1 12:30:00&#39; DAY TO SECOND|</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(make_dt_interval(</span>
<span class="sd"> ... df.day, df.hour).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> |r |</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> |INTERVAL &#39;1 12:00:00&#39; DAY TO SECOND|</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(make_dt_interval(df.day).alias(&#39;r&#39;)).show(truncate=False)</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> |r |</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> |INTERVAL &#39;1 00:00:00&#39; DAY TO SECOND|</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(make_dt_interval().alias(&#39;r&#39;)).show(truncate=False)</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> |r |</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> |INTERVAL &#39;0 00:00:00&#39; DAY TO SECOND|</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_days</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">days</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">days</span>
<span class="n">_hours</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">hours</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">hours</span>
<span class="n">_mins</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">mins</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">mins</span>
<span class="n">_secs</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">decimal</span><span class="o">.</span><span class="n">Decimal</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> <span class="k">if</span> <span class="n">secs</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">secs</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;make_dt_interval&quot;</span><span class="p">,</span> <span class="n">_days</span><span class="p">,</span> <span class="n">_hours</span><span class="p">,</span> <span class="n">_mins</span><span class="p">,</span> <span class="n">_secs</span><span class="p">)</span></div>
<div class="viewcode-block" id="make_interval"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.make_interval.html#pyspark.sql.functions.make_interval">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">make_interval</span><span class="p">(</span>
<span class="n">years</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">months</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">weeks</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">days</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">hours</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">mins</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">secs</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Make interval from years, months, weeks, days, hours, mins and secs.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> years : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the number of years, positive or negative</span>
<span class="sd"> months : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the number of months, positive or negative</span>
<span class="sd"> weeks : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the number of weeks, positive or negative</span>
<span class="sd"> days : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the number of days, positive or negative</span>
<span class="sd"> hours : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the number of hours, positive or negative</span>
<span class="sd"> mins : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the number of minutes, positive or negative</span>
<span class="sd"> secs : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the number of seconds with the fractional part in microsecond precision.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]],</span>
<span class="sd"> ... [&quot;year&quot;, &quot;month&quot;, &quot;week&quot;, &quot;day&quot;, &quot;hour&quot;, &quot;min&quot;, &quot;sec&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(make_interval(</span>
<span class="sd"> ... df.year, df.month, df.week, df.day, df.hour, df.min, df.sec).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +---------------------------------------------------------------+</span>
<span class="sd"> |r |</span>
<span class="sd"> +---------------------------------------------------------------+</span>
<span class="sd"> |100 years 11 months 8 days 12 hours 30 minutes 1.001001 seconds|</span>
<span class="sd"> +---------------------------------------------------------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(make_interval(</span>
<span class="sd"> ... df.year, df.month, df.week, df.day, df.hour, df.min).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +----------------------------------------------+</span>
<span class="sd"> |r |</span>
<span class="sd"> +----------------------------------------------+</span>
<span class="sd"> |100 years 11 months 8 days 12 hours 30 minutes|</span>
<span class="sd"> +----------------------------------------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(make_interval(</span>
<span class="sd"> ... df.year, df.month, df.week, df.day, df.hour).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> |r |</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> |100 years 11 months 8 days 12 hours|</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(make_interval(</span>
<span class="sd"> ... df.year, df.month, df.week, df.day).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +--------------------------+</span>
<span class="sd"> |r |</span>
<span class="sd"> +--------------------------+</span>
<span class="sd"> |100 years 11 months 8 days|</span>
<span class="sd"> +--------------------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(make_interval(</span>
<span class="sd"> ... df.year, df.month, df.week).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +--------------------------+</span>
<span class="sd"> |r |</span>
<span class="sd"> +--------------------------+</span>
<span class="sd"> |100 years 11 months 7 days|</span>
<span class="sd"> +--------------------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(make_interval(df.year, df.month).alias(&#39;r&#39;)).show(truncate=False)</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> |r |</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> |100 years 11 months|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(make_interval(df.year).alias(&#39;r&#39;)).show(truncate=False)</span>
<span class="sd"> +---------+</span>
<span class="sd"> |r |</span>
<span class="sd"> +---------+</span>
<span class="sd"> |100 years|</span>
<span class="sd"> +---------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_years</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">years</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">years</span>
<span class="n">_months</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">months</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">months</span>
<span class="n">_weeks</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">weeks</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">weeks</span>
<span class="n">_days</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">days</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">days</span>
<span class="n">_hours</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">hours</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">hours</span>
<span class="n">_mins</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">mins</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">mins</span>
<span class="n">_secs</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">decimal</span><span class="o">.</span><span class="n">Decimal</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> <span class="k">if</span> <span class="n">secs</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">secs</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span>
<span class="s2">&quot;make_interval&quot;</span><span class="p">,</span> <span class="n">_years</span><span class="p">,</span> <span class="n">_months</span><span class="p">,</span> <span class="n">_weeks</span><span class="p">,</span> <span class="n">_days</span><span class="p">,</span> <span class="n">_hours</span><span class="p">,</span> <span class="n">_mins</span><span class="p">,</span> <span class="n">_secs</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="make_timestamp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.make_timestamp.html#pyspark.sql.functions.make_timestamp">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">make_timestamp</span><span class="p">(</span>
<span class="n">years</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">months</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">days</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">hours</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">mins</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">secs</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">timezone</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Create timestamp from years, months, days, hours, mins, secs and timezone fields.</span>
<span class="sd"> The result data type is consistent with the value of configuration `spark.sql.timestampType`.</span>
<span class="sd"> If the configuration `spark.sql.ansi.enabled` is false, the function returns NULL</span>
<span class="sd"> on invalid inputs. Otherwise, it will throw an error instead.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> years : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the year to represent, from 1 to 9999</span>
<span class="sd"> months : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the month-of-year to represent, from 1 (January) to 12 (December)</span>
<span class="sd"> days : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the day-of-month to represent, from 1 to 31</span>
<span class="sd"> hours : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the hour-of-day to represent, from 0 to 23</span>
<span class="sd"> mins : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the minute-of-hour to represent, from 0 to 59</span>
<span class="sd"> secs : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the second-of-minute and its micro-fraction to represent, from 0 to 60.</span>
<span class="sd"> The value can be either an integer like 13 , or a fraction like 13.123.</span>
<span class="sd"> If the sec argument equals to 60, the seconds field is set</span>
<span class="sd"> to 0 and 1 minute is added to the final timestamp.</span>
<span class="sd"> timezone : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the time zone identifier. For example, CET, UTC and etc.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.set(&quot;spark.sql.session.timeZone&quot;, &quot;America/Los_Angeles&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, &#39;CET&#39;]],</span>
<span class="sd"> ... [&quot;year&quot;, &quot;month&quot;, &quot;day&quot;, &quot;hour&quot;, &quot;min&quot;, &quot;sec&quot;, &quot;timezone&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(make_timestamp(</span>
<span class="sd"> ... df.year, df.month, df.day, df.hour, df.min, df.sec, df.timezone).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +-----------------------+</span>
<span class="sd"> |r |</span>
<span class="sd"> +-----------------------+</span>
<span class="sd"> |2014-12-27 21:30:45.887|</span>
<span class="sd"> +-----------------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(make_timestamp(</span>
<span class="sd"> ... df.year, df.month, df.day, df.hour, df.min, df.sec).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +-----------------------+</span>
<span class="sd"> |r |</span>
<span class="sd"> +-----------------------+</span>
<span class="sd"> |2014-12-28 06:30:45.887|</span>
<span class="sd"> +-----------------------+</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.unset(&quot;spark.sql.session.timeZone&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">timezone</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span>
<span class="s2">&quot;make_timestamp&quot;</span><span class="p">,</span> <span class="n">years</span><span class="p">,</span> <span class="n">months</span><span class="p">,</span> <span class="n">days</span><span class="p">,</span> <span class="n">hours</span><span class="p">,</span> <span class="n">mins</span><span class="p">,</span> <span class="n">secs</span><span class="p">,</span> <span class="n">timezone</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span>
<span class="s2">&quot;make_timestamp&quot;</span><span class="p">,</span> <span class="n">years</span><span class="p">,</span> <span class="n">months</span><span class="p">,</span> <span class="n">days</span><span class="p">,</span> <span class="n">hours</span><span class="p">,</span> <span class="n">mins</span><span class="p">,</span> <span class="n">secs</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="make_timestamp_ltz"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.make_timestamp_ltz.html#pyspark.sql.functions.make_timestamp_ltz">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">make_timestamp_ltz</span><span class="p">(</span>
<span class="n">years</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">months</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">days</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">hours</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">mins</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">secs</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">timezone</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Create the current timestamp with local time zone from years, months, days, hours, mins,</span>
<span class="sd"> secs and timezone fields. If the configuration `spark.sql.ansi.enabled` is false,</span>
<span class="sd"> the function returns NULL on invalid inputs. Otherwise, it will throw an error instead.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> years : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the year to represent, from 1 to 9999</span>
<span class="sd"> months : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the month-of-year to represent, from 1 (January) to 12 (December)</span>
<span class="sd"> days : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the day-of-month to represent, from 1 to 31</span>
<span class="sd"> hours : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the hour-of-day to represent, from 0 to 23</span>
<span class="sd"> mins : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the minute-of-hour to represent, from 0 to 59</span>
<span class="sd"> secs : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the second-of-minute and its micro-fraction to represent, from 0 to 60.</span>
<span class="sd"> The value can be either an integer like 13 , or a fraction like 13.123.</span>
<span class="sd"> If the sec argument equals to 60, the seconds field is set</span>
<span class="sd"> to 0 and 1 minute is added to the final timestamp.</span>
<span class="sd"> timezone : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the time zone identifier. For example, CET, UTC and etc.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.set(&quot;spark.sql.session.timeZone&quot;, &quot;America/Los_Angeles&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, &#39;CET&#39;]],</span>
<span class="sd"> ... [&quot;year&quot;, &quot;month&quot;, &quot;day&quot;, &quot;hour&quot;, &quot;min&quot;, &quot;sec&quot;, &quot;timezone&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(sf.make_timestamp_ltz(</span>
<span class="sd"> ... df.year, df.month, df.day, df.hour, df.min, df.sec, df.timezone)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +--------------------------------------------------------------+</span>
<span class="sd"> |make_timestamp_ltz(year, month, day, hour, min, sec, timezone)|</span>
<span class="sd"> +--------------------------------------------------------------+</span>
<span class="sd"> |2014-12-27 21:30:45.887 |</span>
<span class="sd"> +--------------------------------------------------------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(sf.make_timestamp_ltz(</span>
<span class="sd"> ... df.year, df.month, df.day, df.hour, df.min, df.sec)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +----------------------------------------------------+</span>
<span class="sd"> |make_timestamp_ltz(year, month, day, hour, min, sec)|</span>
<span class="sd"> +----------------------------------------------------+</span>
<span class="sd"> |2014-12-28 06:30:45.887 |</span>
<span class="sd"> +----------------------------------------------------+</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.unset(&quot;spark.sql.session.timeZone&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">timezone</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span>
<span class="s2">&quot;make_timestamp_ltz&quot;</span><span class="p">,</span> <span class="n">years</span><span class="p">,</span> <span class="n">months</span><span class="p">,</span> <span class="n">days</span><span class="p">,</span> <span class="n">hours</span><span class="p">,</span> <span class="n">mins</span><span class="p">,</span> <span class="n">secs</span><span class="p">,</span> <span class="n">timezone</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span>
<span class="s2">&quot;make_timestamp_ltz&quot;</span><span class="p">,</span> <span class="n">years</span><span class="p">,</span> <span class="n">months</span><span class="p">,</span> <span class="n">days</span><span class="p">,</span> <span class="n">hours</span><span class="p">,</span> <span class="n">mins</span><span class="p">,</span> <span class="n">secs</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="make_timestamp_ntz"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.make_timestamp_ntz.html#pyspark.sql.functions.make_timestamp_ntz">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">make_timestamp_ntz</span><span class="p">(</span>
<span class="n">years</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">months</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">days</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">hours</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">mins</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">secs</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Create local date-time from years, months, days, hours, mins, secs fields.</span>
<span class="sd"> If the configuration `spark.sql.ansi.enabled` is false, the function returns NULL</span>
<span class="sd"> on invalid inputs. Otherwise, it will throw an error instead.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> years : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the year to represent, from 1 to 9999</span>
<span class="sd"> months : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the month-of-year to represent, from 1 (January) to 12 (December)</span>
<span class="sd"> days : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the day-of-month to represent, from 1 to 31</span>
<span class="sd"> hours : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the hour-of-day to represent, from 0 to 23</span>
<span class="sd"> mins : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the minute-of-hour to represent, from 0 to 59</span>
<span class="sd"> secs : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the second-of-minute and its micro-fraction to represent, from 0 to 60.</span>
<span class="sd"> The value can be either an integer like 13 , or a fraction like 13.123.</span>
<span class="sd"> If the sec argument equals to 60, the seconds field is set</span>
<span class="sd"> to 0 and 1 minute is added to the final timestamp.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.set(&quot;spark.sql.session.timeZone&quot;, &quot;America/Los_Angeles&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887]],</span>
<span class="sd"> ... [&quot;year&quot;, &quot;month&quot;, &quot;day&quot;, &quot;hour&quot;, &quot;min&quot;, &quot;sec&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(sf.make_timestamp_ntz(</span>
<span class="sd"> ... df.year, df.month, df.day, df.hour, df.min, df.sec)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +----------------------------------------------------+</span>
<span class="sd"> |make_timestamp_ntz(year, month, day, hour, min, sec)|</span>
<span class="sd"> +----------------------------------------------------+</span>
<span class="sd"> |2014-12-28 06:30:45.887 |</span>
<span class="sd"> +----------------------------------------------------+</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.unset(&quot;spark.sql.session.timeZone&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span>
<span class="s2">&quot;make_timestamp_ntz&quot;</span><span class="p">,</span> <span class="n">years</span><span class="p">,</span> <span class="n">months</span><span class="p">,</span> <span class="n">days</span><span class="p">,</span> <span class="n">hours</span><span class="p">,</span> <span class="n">mins</span><span class="p">,</span> <span class="n">secs</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="make_ym_interval"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.make_ym_interval.html#pyspark.sql.functions.make_ym_interval">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">make_ym_interval</span><span class="p">(</span>
<span class="n">years</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">months</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Make year-month interval from years, months.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> years : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the number of years, positive or negative</span>
<span class="sd"> months : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the number of months, positive or negative</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.set(&quot;spark.sql.session.timeZone&quot;, &quot;America/Los_Angeles&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[2014, 12]], [&quot;year&quot;, &quot;month&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(make_ym_interval(df.year, df.month).alias(&#39;r&#39;)).show(truncate=False)</span>
<span class="sd"> +-------------------------------+</span>
<span class="sd"> |r |</span>
<span class="sd"> +-------------------------------+</span>
<span class="sd"> |INTERVAL &#39;2015-0&#39; YEAR TO MONTH|</span>
<span class="sd"> +-------------------------------+</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.unset(&quot;spark.sql.session.timeZone&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_years</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">years</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">years</span>
<span class="n">_months</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">months</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">months</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;make_ym_interval&quot;</span><span class="p">,</span> <span class="n">_years</span><span class="p">,</span> <span class="n">_months</span><span class="p">)</span></div>
<div class="viewcode-block" id="bucket"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bucket.html#pyspark.sql.functions.bucket">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">bucket</span><span class="p">(</span><span class="n">numBuckets</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">int</span><span class="p">],</span> <span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Partition transform function: A transform for any type that partitions</span>
<span class="sd"> by a hash of the input column.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.writeTo(&quot;catalog.db.table&quot;).partitionedBy( # doctest: +SKIP</span>
<span class="sd"> ... bucket(42, &quot;ts&quot;)</span>
<span class="sd"> ... ).createOrReplace()</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target date or timestamp column to work on.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> data partitioned by given columns.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This function can be used only in combination with</span>
<span class="sd"> :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`</span>
<span class="sd"> method of the `DataFrameWriterV2`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">numBuckets</span><span class="p">,</span> <span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="n">Column</span><span class="p">)):</span>
<span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span>
<span class="n">error_class</span><span class="o">=</span><span class="s2">&quot;NOT_COLUMN_OR_INT&quot;</span><span class="p">,</span>
<span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;arg_name&quot;</span><span class="p">:</span> <span class="s2">&quot;numBuckets&quot;</span><span class="p">,</span> <span class="s2">&quot;arg_type&quot;</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">numBuckets</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span>
<span class="p">)</span>
<span class="n">get_active_spark_context</span><span class="p">()</span>
<span class="n">numBuckets</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">numBuckets</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">numBuckets</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span>
<span class="k">else</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">numBuckets</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;bucket&quot;</span><span class="p">,</span> <span class="n">numBuckets</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">))</span></div>
<div class="viewcode-block" id="call_udf"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.call_udf.html#pyspark.sql.functions.call_udf">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">call_udf</span><span class="p">(</span><span class="n">udfName</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Call an user-defined function.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> udfName : str</span>
<span class="sd"> name of the user defined function (UDF)</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column names or :class:`~pyspark.sql.Column`\\s to be used in the UDF</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> result of executed udf.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import call_udf, col</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.types import IntegerType, StringType</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, &quot;a&quot;),(2, &quot;b&quot;), (3, &quot;c&quot;)],[&quot;id&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; _ = spark.udf.register(&quot;intX2&quot;, lambda i: i * 2, IntegerType())</span>
<span class="sd"> &gt;&gt;&gt; df.select(call_udf(&quot;intX2&quot;, &quot;id&quot;)).show()</span>
<span class="sd"> +---------+</span>
<span class="sd"> |intX2(id)|</span>
<span class="sd"> +---------+</span>
<span class="sd"> | 2|</span>
<span class="sd"> | 4|</span>
<span class="sd"> | 6|</span>
<span class="sd"> +---------+</span>
<span class="sd"> &gt;&gt;&gt; _ = spark.udf.register(&quot;strX2&quot;, lambda s: s * 2, StringType())</span>
<span class="sd"> &gt;&gt;&gt; df.select(call_udf(&quot;strX2&quot;, col(&quot;name&quot;))).show()</span>
<span class="sd"> +-----------+</span>
<span class="sd"> |strX2(name)|</span>
<span class="sd"> +-----------+</span>
<span class="sd"> | aa|</span>
<span class="sd"> | bb|</span>
<span class="sd"> | cc|</span>
<span class="sd"> +-----------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;call_udf&quot;</span><span class="p">,</span> <span class="n">udfName</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span></div>
<div class="viewcode-block" id="call_function"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.call_function.html#pyspark.sql.functions.call_function">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">call_function</span><span class="p">(</span><span class="n">funcName</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Call a SQL function.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> funcName : str</span>
<span class="sd"> function name that follows the SQL identifier syntax (can be quoted, can be qualified)</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column names or :class:`~pyspark.sql.Column`\\s to be used in the function</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> result of executed function.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import call_udf, col</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.types import IntegerType, StringType</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, &quot;a&quot;),(2, &quot;b&quot;), (3, &quot;c&quot;)],[&quot;id&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; _ = spark.udf.register(&quot;intX2&quot;, lambda i: i * 2, IntegerType())</span>
<span class="sd"> &gt;&gt;&gt; df.select(call_function(&quot;intX2&quot;, &quot;id&quot;)).show()</span>
<span class="sd"> +---------+</span>
<span class="sd"> |intX2(id)|</span>
<span class="sd"> +---------+</span>
<span class="sd"> | 2|</span>
<span class="sd"> | 4|</span>
<span class="sd"> | 6|</span>
<span class="sd"> +---------+</span>
<span class="sd"> &gt;&gt;&gt; _ = spark.udf.register(&quot;strX2&quot;, lambda s: s * 2, StringType())</span>
<span class="sd"> &gt;&gt;&gt; df.select(call_function(&quot;strX2&quot;, col(&quot;name&quot;))).show()</span>
<span class="sd"> +-----------+</span>
<span class="sd"> |strX2(name)|</span>
<span class="sd"> +-----------+</span>
<span class="sd"> | aa|</span>
<span class="sd"> | bb|</span>
<span class="sd"> | cc|</span>
<span class="sd"> +-----------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(call_function(&quot;avg&quot;, col(&quot;id&quot;))).show()</span>
<span class="sd"> +-------+</span>
<span class="sd"> |avg(id)|</span>
<span class="sd"> +-------+</span>
<span class="sd"> | 2.0|</span>
<span class="sd"> +-------+</span>
<span class="sd"> &gt;&gt;&gt; _ = spark.sql(&quot;CREATE FUNCTION custom_avg AS &#39;test.org.apache.spark.sql.MyDoubleAvg&#39;&quot;)</span>
<span class="sd"> ... # doctest: +SKIP</span>
<span class="sd"> &gt;&gt;&gt; df.select(call_function(&quot;custom_avg&quot;, col(&quot;id&quot;))).show()</span>
<span class="sd"> ... # doctest: +SKIP</span>
<span class="sd"> +------------------------------------+</span>
<span class="sd"> |spark_catalog.default.custom_avg(id)|</span>
<span class="sd"> +------------------------------------+</span>
<span class="sd"> | 102.0|</span>
<span class="sd"> +------------------------------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(call_function(&quot;spark_catalog.default.custom_avg&quot;, col(&quot;id&quot;))).show()</span>
<span class="sd"> ... # doctest: +SKIP</span>
<span class="sd"> +------------------------------------+</span>
<span class="sd"> |spark_catalog.default.custom_avg(id)|</span>
<span class="sd"> +------------------------------------+</span>
<span class="sd"> | 102.0|</span>
<span class="sd"> +------------------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;call_function&quot;</span><span class="p">,</span> <span class="n">funcName</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span></div>
<div class="viewcode-block" id="unwrap_udt"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.unwrap_udt.html#pyspark.sql.functions.unwrap_udt">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">unwrap_udt</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Unwrap UDT data type column into its underlying type.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;unwrap_udt&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">))</span></div>
<div class="viewcode-block" id="hll_sketch_agg"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.hll_sketch_agg.html#pyspark.sql.functions.hll_sketch_agg">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">hll_sketch_agg</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">lgConfigK</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Column</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the updatable binary representation of the Datasketches</span>
<span class="sd"> HllSketch configured with lgConfigK arg.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str or int</span>
<span class="sd"> lgConfigK : int, optional</span>
<span class="sd"> The log-base-2 of K, where K is the number of buckets or slots for the HllSketch</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> The binary representation of the HllSketch.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([1,2,2,3], &quot;INT&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df1 = df.agg(hll_sketch_estimate(hll_sketch_agg(&quot;value&quot;)).alias(&quot;distinct_cnt&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df1.show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |distinct_cnt|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | 3|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &gt;&gt;&gt; df2 = df.agg(hll_sketch_estimate(</span>
<span class="sd"> ... hll_sketch_agg(&quot;value&quot;, lit(12))</span>
<span class="sd"> ... ).alias(&quot;distinct_cnt&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df2.show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |distinct_cnt|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | 3|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &gt;&gt;&gt; df3 = df.agg(hll_sketch_estimate(</span>
<span class="sd"> ... hll_sketch_agg(col(&quot;value&quot;), lit(12))).alias(&quot;distinct_cnt&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df3.show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |distinct_cnt|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | 3|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">lgConfigK</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;hll_sketch_agg&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">_lgConfigK</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">lgConfigK</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">lgConfigK</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">lgConfigK</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;hll_sketch_agg&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">_lgConfigK</span><span class="p">)</span></div>
<div class="viewcode-block" id="hll_union_agg"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.hll_union_agg.html#pyspark.sql.functions.hll_union_agg">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">hll_union_agg</span><span class="p">(</span>
<span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">allowDifferentLgConfigK</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">bool</span><span class="p">,</span> <span class="n">Column</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the updatable binary representation of the Datasketches</span>
<span class="sd"> HllSketch, generated by merging previously created Datasketches HllSketch instances</span>
<span class="sd"> via a Datasketches Union instance. Throws an exception if sketches have different</span>
<span class="sd"> lgConfigK values and allowDifferentLgConfigK is unset or set to false.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str or bool</span>
<span class="sd"> allowDifferentLgConfigK : bool, optional</span>
<span class="sd"> Allow sketches with different lgConfigK values to be merged (defaults to false).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> The binary representation of the merged HllSketch.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([1,2,2,3], &quot;INT&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df1 = df1.agg(hll_sketch_agg(&quot;value&quot;).alias(&quot;sketch&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([4,5,5,6], &quot;INT&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df2 = df2.agg(hll_sketch_agg(&quot;value&quot;).alias(&quot;sketch&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df3 = df1.union(df2).agg(hll_sketch_estimate(</span>
<span class="sd"> ... hll_union_agg(&quot;sketch&quot;)</span>
<span class="sd"> ... ).alias(&quot;distinct_cnt&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df3.drop(&quot;sketch&quot;).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |distinct_cnt|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | 6|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &gt;&gt;&gt; df4 = df1.union(df2).agg(hll_sketch_estimate(</span>
<span class="sd"> ... hll_union_agg(&quot;sketch&quot;, lit(False))</span>
<span class="sd"> ... ).alias(&quot;distinct_cnt&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df4.drop(&quot;sketch&quot;).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |distinct_cnt|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | 6|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &gt;&gt;&gt; df5 = df1.union(df2).agg(hll_sketch_estimate(</span>
<span class="sd"> ... hll_union_agg(col(&quot;sketch&quot;), lit(False))</span>
<span class="sd"> ... ).alias(&quot;distinct_cnt&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df5.drop(&quot;sketch&quot;).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |distinct_cnt|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | 6|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">allowDifferentLgConfigK</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;hll_union_agg&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">_allowDifferentLgConfigK</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">lit</span><span class="p">(</span><span class="n">allowDifferentLgConfigK</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">allowDifferentLgConfigK</span><span class="p">,</span> <span class="nb">bool</span><span class="p">)</span>
<span class="k">else</span> <span class="n">allowDifferentLgConfigK</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;hll_union_agg&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">_allowDifferentLgConfigK</span><span class="p">)</span></div>
<div class="viewcode-block" id="hll_sketch_estimate"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.hll_sketch_estimate.html#pyspark.sql.functions.hll_sketch_estimate">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">hll_sketch_estimate</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the estimated number of unique values given the binary representation</span>
<span class="sd"> of a Datasketches HllSketch.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> The estimated number of unique values for the HllSketch.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([1,2,2,3], &quot;INT&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = df.agg(hll_sketch_estimate(hll_sketch_agg(&quot;value&quot;)).alias(&quot;distinct_cnt&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |distinct_cnt|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | 3|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;hll_sketch_estimate&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">))</span></div>
<div class="viewcode-block" id="hll_union"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.hll_union.html#pyspark.sql.functions.hll_union">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">hll_union</span><span class="p">(</span>
<span class="n">col1</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">allowDifferentLgConfigK</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Merges two binary representations of Datasketches HllSketch objects, using a</span>
<span class="sd"> Datasketches Union object. Throws an exception if sketches have different</span>
<span class="sd"> lgConfigK values and allowDifferentLgConfigK is unset or set to false.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> allowDifferentLgConfigK : bool, optional</span>
<span class="sd"> Allow sketches with different lgConfigK values to be merged (defaults to false).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> The binary representation of the merged HllSketch.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1,4),(2,5),(2,5),(3,6)], &quot;struct&lt;v1:int,v2:int&gt;&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = df.agg(hll_sketch_agg(&quot;v1&quot;).alias(&quot;sketch1&quot;), hll_sketch_agg(&quot;v2&quot;).alias(&quot;sketch2&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df = df.withColumn(&quot;distinct_cnt&quot;, hll_sketch_estimate(hll_union(&quot;sketch1&quot;, &quot;sketch2&quot;)))</span>
<span class="sd"> &gt;&gt;&gt; df.drop(&quot;sketch1&quot;, &quot;sketch2&quot;).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |distinct_cnt|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | 6|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">allowDifferentLgConfigK</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span>
<span class="s2">&quot;hll_union&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col1</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col2</span><span class="p">),</span> <span class="n">allowDifferentLgConfigK</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;hll_union&quot;</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col1</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col2</span><span class="p">))</span></div>
<span class="c1"># ---------------------- Predicates functions ------------------------------</span>
<div class="viewcode-block" id="ifnull"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.ifnull.html#pyspark.sql.functions.ifnull">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">ifnull</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns `col2` if `col1` is null, or `col1` otherwise.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(None,), (1,)], [&quot;e&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(sf.ifnull(df.e, sf.lit(8))).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |ifnull(e, 8)|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | 8|</span>
<span class="sd"> | 1|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;ifnull&quot;</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div>
<div class="viewcode-block" id="isnotnull"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.isnotnull.html#pyspark.sql.functions.isnotnull">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">isnotnull</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns true if `col` is not null, or false otherwise.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(None,), (1,)], [&quot;e&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(isnotnull(df.e).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=False), Row(r=True)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;isnotnull&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="equal_null"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.equal_null.html#pyspark.sql.functions.equal_null">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">equal_null</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns same result as the EQUAL(=) operator for non-null operands,</span>
<span class="sd"> but returns true if both are null, false if one of the them is null.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(None, None,), (1, 9,)], [&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(equal_null(df.a, df.b).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=True), Row(r=False)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;equal_null&quot;</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div>
<div class="viewcode-block" id="nullif"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.nullif.html#pyspark.sql.functions.nullif">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">nullif</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns null if `col1` equals to `col2`, or `col1` otherwise.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(None, None,), (1, 9,)], [&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(nullif(df.a, df.b).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=None), Row(r=1)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;nullif&quot;</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div>
<div class="viewcode-block" id="nvl"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.nvl.html#pyspark.sql.functions.nvl">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">nvl</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns `col2` if `col1` is null, or `col1` otherwise.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(None, 8,), (1, 9,)], [&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(nvl(df.a, df.b).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=8), Row(r=1)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;nvl&quot;</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div>
<div class="viewcode-block" id="nvl2"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.nvl2.html#pyspark.sql.functions.nvl2">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">nvl2</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">col3</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns `col2` if `col1` is not null, or `col3` otherwise.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> col3 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(None, 8, 6,), (1, 9, 9,)], [&quot;a&quot;, &quot;b&quot;, &quot;c&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(nvl2(df.a, df.b, df.c).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=6), Row(r=9)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;nvl2&quot;</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">,</span> <span class="n">col3</span><span class="p">)</span></div>
<div class="viewcode-block" id="aes_encrypt"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.aes_encrypt.html#pyspark.sql.functions.aes_encrypt">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">aes_encrypt</span><span class="p">(</span>
<span class="nb">input</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">key</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">mode</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">padding</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">iv</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">aad</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns an encrypted value of `input` using AES in given `mode` with the specified `padding`.</span>
<span class="sd"> Key lengths of 16, 24 and 32 bits are supported. Supported combinations of (`mode`,</span>
<span class="sd"> `padding`) are (&#39;ECB&#39;, &#39;PKCS&#39;), (&#39;GCM&#39;, &#39;NONE&#39;) and (&#39;CBC&#39;, &#39;PKCS&#39;). Optional initialization</span>
<span class="sd"> vectors (IVs) are only supported for CBC and GCM modes. These must be 16 bytes for CBC and 12</span>
<span class="sd"> bytes for GCM. If not provided, a random vector will be generated and prepended to the</span>
<span class="sd"> output. Optional additional authenticated data (AAD) is only supported for GCM. If provided</span>
<span class="sd"> for encryption, the identical AAD value must be provided for decryption. The default mode is</span>
<span class="sd"> GCM.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> input : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The binary value to encrypt.</span>
<span class="sd"> key : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The passphrase to use to encrypt the data.</span>
<span class="sd"> mode : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> Specifies which block cipher mode should be used to encrypt messages. Valid modes: ECB,</span>
<span class="sd"> GCM, CBC.</span>
<span class="sd"> padding : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> Specifies how to pad messages whose length is not a multiple of the block size. Valid</span>
<span class="sd"> values: PKCS, NONE, DEFAULT. The DEFAULT padding means PKCS for ECB, NONE for GCM and PKCS</span>
<span class="sd"> for CBC.</span>
<span class="sd"> iv : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> Optional initialization vector. Only supported for CBC and GCM modes. Valid values: None or</span>
<span class="sd"> &quot;&quot;. 16-byte array for CBC mode. 12-byte array for GCM mode.</span>
<span class="sd"> aad : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> Optional additional authenticated data. Only supported for GCM mode. This can be any</span>
<span class="sd"> free-form input and must be provided for both encryption and decryption.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(</span>
<span class="sd"> ... &quot;Spark&quot;, &quot;abcdefghijklmnop12345678ABCDEFGH&quot;, &quot;GCM&quot;, &quot;DEFAULT&quot;,</span>
<span class="sd"> ... &quot;000000000000000000000000&quot;, &quot;This is an AAD mixed into the input&quot;,)],</span>
<span class="sd"> ... [&quot;input&quot;, &quot;key&quot;, &quot;mode&quot;, &quot;padding&quot;, &quot;iv&quot;, &quot;aad&quot;]</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(base64(aes_encrypt(</span>
<span class="sd"> ... df.input, df.key, df.mode, df.padding, to_binary(df.iv, lit(&quot;hex&quot;)), df.aad)</span>
<span class="sd"> ... ).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;AAAAAAAAAAAAAAAAQiYi+sTLm7KD9UcZ2nlRdYDe/PX4&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(base64(aes_encrypt(</span>
<span class="sd"> ... df.input, df.key, df.mode, df.padding, to_binary(df.iv, lit(&quot;hex&quot;)))</span>
<span class="sd"> ... ).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;AAAAAAAAAAAAAAAAQiYi+sRNYDAOTjdSEcYBFsAWPL1f&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(</span>
<span class="sd"> ... &quot;Spark SQL&quot;, &quot;1234567890abcdef&quot;, &quot;ECB&quot;, &quot;PKCS&quot;,)],</span>
<span class="sd"> ... [&quot;input&quot;, &quot;key&quot;, &quot;mode&quot;, &quot;padding&quot;]</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(aes_decrypt(aes_encrypt(df.input, df.key, df.mode, df.padding),</span>
<span class="sd"> ... df.key, df.mode, df.padding).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).collect()</span>
<span class="sd"> [Row(r=bytearray(b&#39;Spark SQL&#39;))]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(</span>
<span class="sd"> ... &quot;Spark SQL&quot;, &quot;0000111122223333&quot;, &quot;ECB&quot;,)],</span>
<span class="sd"> ... [&quot;input&quot;, &quot;key&quot;, &quot;mode&quot;]</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(aes_decrypt(aes_encrypt(df.input, df.key, df.mode),</span>
<span class="sd"> ... df.key, df.mode).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).collect()</span>
<span class="sd"> [Row(r=bytearray(b&#39;Spark SQL&#39;))]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(</span>
<span class="sd"> ... &quot;Spark SQL&quot;, &quot;abcdefghijklmnop&quot;,)],</span>
<span class="sd"> ... [&quot;input&quot;, &quot;key&quot;]</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(aes_decrypt(</span>
<span class="sd"> ... unbase64(base64(aes_encrypt(df.input, df.key))), df.key</span>
<span class="sd"> ... ).cast(&quot;STRING&quot;).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;Spark SQL&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_mode</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">&quot;GCM&quot;</span><span class="p">)</span> <span class="k">if</span> <span class="n">mode</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">mode</span>
<span class="n">_padding</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">&quot;DEFAULT&quot;</span><span class="p">)</span> <span class="k">if</span> <span class="n">padding</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">padding</span>
<span class="n">_iv</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">&quot;&quot;</span><span class="p">)</span> <span class="k">if</span> <span class="n">iv</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">iv</span>
<span class="n">_aad</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">&quot;&quot;</span><span class="p">)</span> <span class="k">if</span> <span class="n">aad</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">aad</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;aes_encrypt&quot;</span><span class="p">,</span> <span class="nb">input</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="n">_mode</span><span class="p">,</span> <span class="n">_padding</span><span class="p">,</span> <span class="n">_iv</span><span class="p">,</span> <span class="n">_aad</span><span class="p">)</span></div>
<div class="viewcode-block" id="aes_decrypt"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.aes_decrypt.html#pyspark.sql.functions.aes_decrypt">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">aes_decrypt</span><span class="p">(</span>
<span class="nb">input</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">key</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">mode</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">padding</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">aad</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a decrypted value of `input` using AES in `mode` with `padding`. Key lengths of 16,</span>
<span class="sd"> 24 and 32 bits are supported. Supported combinations of (`mode`, `padding`) are (&#39;ECB&#39;,</span>
<span class="sd"> &#39;PKCS&#39;), (&#39;GCM&#39;, &#39;NONE&#39;) and (&#39;CBC&#39;, &#39;PKCS&#39;). Optional additional authenticated data (AAD) is</span>
<span class="sd"> only supported for GCM. If provided for encryption, the identical AAD value must be provided</span>
<span class="sd"> for decryption. The default mode is GCM.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> input : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The binary value to decrypt.</span>
<span class="sd"> key : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The passphrase to use to decrypt the data.</span>
<span class="sd"> mode : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> Specifies which block cipher mode should be used to decrypt messages. Valid modes: ECB,</span>
<span class="sd"> GCM, CBC.</span>
<span class="sd"> padding : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> Specifies how to pad messages whose length is not a multiple of the block size. Valid</span>
<span class="sd"> values: PKCS, NONE, DEFAULT. The DEFAULT padding means PKCS for ECB, NONE for GCM and PKCS</span>
<span class="sd"> for CBC.</span>
<span class="sd"> aad : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> Optional additional authenticated data. Only supported for GCM mode. This can be any</span>
<span class="sd"> free-form input and must be provided for both encryption and decryption.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(</span>
<span class="sd"> ... &quot;AAAAAAAAAAAAAAAAQiYi+sTLm7KD9UcZ2nlRdYDe/PX4&quot;,</span>
<span class="sd"> ... &quot;abcdefghijklmnop12345678ABCDEFGH&quot;, &quot;GCM&quot;, &quot;DEFAULT&quot;,</span>
<span class="sd"> ... &quot;This is an AAD mixed into the input&quot;,)],</span>
<span class="sd"> ... [&quot;input&quot;, &quot;key&quot;, &quot;mode&quot;, &quot;padding&quot;, &quot;aad&quot;]</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(aes_decrypt(</span>
<span class="sd"> ... unbase64(df.input), df.key, df.mode, df.padding, df.aad).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).collect()</span>
<span class="sd"> [Row(r=bytearray(b&#39;Spark&#39;))]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(</span>
<span class="sd"> ... &quot;AAAAAAAAAAAAAAAAAAAAAPSd4mWyMZ5mhvjiAPQJnfg=&quot;,</span>
<span class="sd"> ... &quot;abcdefghijklmnop12345678ABCDEFGH&quot;, &quot;CBC&quot;, &quot;DEFAULT&quot;,)],</span>
<span class="sd"> ... [&quot;input&quot;, &quot;key&quot;, &quot;mode&quot;, &quot;padding&quot;]</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(aes_decrypt(</span>
<span class="sd"> ... unbase64(df.input), df.key, df.mode, df.padding).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).collect()</span>
<span class="sd"> [Row(r=bytearray(b&#39;Spark&#39;))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(aes_decrypt(unbase64(df.input), df.key, df.mode).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=bytearray(b&#39;Spark&#39;))]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(</span>
<span class="sd"> ... &quot;83F16B2AA704794132802D248E6BFD4E380078182D1544813898AC97E709B28A94&quot;,</span>
<span class="sd"> ... &quot;0000111122223333&quot;,)],</span>
<span class="sd"> ... [&quot;input&quot;, &quot;key&quot;]</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(aes_decrypt(unhex(df.input), df.key).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=bytearray(b&#39;Spark&#39;))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_mode</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">&quot;GCM&quot;</span><span class="p">)</span> <span class="k">if</span> <span class="n">mode</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">mode</span>
<span class="n">_padding</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">&quot;DEFAULT&quot;</span><span class="p">)</span> <span class="k">if</span> <span class="n">padding</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">padding</span>
<span class="n">_aad</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">&quot;&quot;</span><span class="p">)</span> <span class="k">if</span> <span class="n">aad</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">aad</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;aes_decrypt&quot;</span><span class="p">,</span> <span class="nb">input</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="n">_mode</span><span class="p">,</span> <span class="n">_padding</span><span class="p">,</span> <span class="n">_aad</span><span class="p">)</span></div>
<div class="viewcode-block" id="try_aes_decrypt"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_aes_decrypt.html#pyspark.sql.functions.try_aes_decrypt">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">try_aes_decrypt</span><span class="p">(</span>
<span class="nb">input</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">key</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span>
<span class="n">mode</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">padding</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">aad</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> This is a special version of `aes_decrypt` that performs the same operation,</span>
<span class="sd"> but returns a NULL value instead of raising an error if the decryption cannot be performed.</span>
<span class="sd"> Returns a decrypted value of `input` using AES in `mode` with `padding`. Key lengths of 16,</span>
<span class="sd"> 24 and 32 bits are supported. Supported combinations of (`mode`, `padding`) are (&#39;ECB&#39;,</span>
<span class="sd"> &#39;PKCS&#39;), (&#39;GCM&#39;, &#39;NONE&#39;) and (&#39;CBC&#39;, &#39;PKCS&#39;). Optional additional authenticated data (AAD) is</span>
<span class="sd"> only supported for GCM. If provided for encryption, the identical AAD value must be provided</span>
<span class="sd"> for decryption. The default mode is GCM.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> input : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The binary value to decrypt.</span>
<span class="sd"> key : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The passphrase to use to decrypt the data.</span>
<span class="sd"> mode : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> Specifies which block cipher mode should be used to decrypt messages. Valid modes: ECB,</span>
<span class="sd"> GCM, CBC.</span>
<span class="sd"> padding : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> Specifies how to pad messages whose length is not a multiple of the block size. Valid</span>
<span class="sd"> values: PKCS, NONE, DEFAULT. The DEFAULT padding means PKCS for ECB, NONE for GCM and PKCS</span>
<span class="sd"> for CBC.</span>
<span class="sd"> aad : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> Optional additional authenticated data. Only supported for GCM mode. This can be any</span>
<span class="sd"> free-form input and must be provided for both encryption and decryption.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(</span>
<span class="sd"> ... &quot;AAAAAAAAAAAAAAAAQiYi+sTLm7KD9UcZ2nlRdYDe/PX4&quot;,</span>
<span class="sd"> ... &quot;abcdefghijklmnop12345678ABCDEFGH&quot;, &quot;GCM&quot;, &quot;DEFAULT&quot;,</span>
<span class="sd"> ... &quot;This is an AAD mixed into the input&quot;,)],</span>
<span class="sd"> ... [&quot;input&quot;, &quot;key&quot;, &quot;mode&quot;, &quot;padding&quot;, &quot;aad&quot;]</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_aes_decrypt(</span>
<span class="sd"> ... unbase64(df.input), df.key, df.mode, df.padding, df.aad).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).collect()</span>
<span class="sd"> [Row(r=bytearray(b&#39;Spark&#39;))]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(</span>
<span class="sd"> ... &quot;AAAAAAAAAAAAAAAAAAAAAPSd4mWyMZ5mhvjiAPQJnfg=&quot;,</span>
<span class="sd"> ... &quot;abcdefghijklmnop12345678ABCDEFGH&quot;, &quot;CBC&quot;, &quot;DEFAULT&quot;,)],</span>
<span class="sd"> ... [&quot;input&quot;, &quot;key&quot;, &quot;mode&quot;, &quot;padding&quot;]</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_aes_decrypt(</span>
<span class="sd"> ... unbase64(df.input), df.key, df.mode, df.padding).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).collect()</span>
<span class="sd"> [Row(r=bytearray(b&#39;Spark&#39;))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_aes_decrypt(unbase64(df.input), df.key, df.mode).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=bytearray(b&#39;Spark&#39;))]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(</span>
<span class="sd"> ... &quot;83F16B2AA704794132802D248E6BFD4E380078182D1544813898AC97E709B28A94&quot;,</span>
<span class="sd"> ... &quot;0000111122223333&quot;,)],</span>
<span class="sd"> ... [&quot;input&quot;, &quot;key&quot;]</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(try_aes_decrypt(unhex(df.input), df.key).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=bytearray(b&#39;Spark&#39;))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_mode</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">&quot;GCM&quot;</span><span class="p">)</span> <span class="k">if</span> <span class="n">mode</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">mode</span>
<span class="n">_padding</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">&quot;DEFAULT&quot;</span><span class="p">)</span> <span class="k">if</span> <span class="n">padding</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">padding</span>
<span class="n">_aad</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">&quot;&quot;</span><span class="p">)</span> <span class="k">if</span> <span class="n">aad</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">aad</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;try_aes_decrypt&quot;</span><span class="p">,</span> <span class="nb">input</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="n">_mode</span><span class="p">,</span> <span class="n">_padding</span><span class="p">,</span> <span class="n">_aad</span><span class="p">)</span></div>
<div class="viewcode-block" id="sha"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sha.html#pyspark.sql.functions.sha">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">sha</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a sha1 hash value as a hex string of the `col`.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).select(sf.sha(sf.lit(&quot;Spark&quot;))).show()</span>
<span class="sd"> +--------------------+</span>
<span class="sd"> | sha(Spark)|</span>
<span class="sd"> +--------------------+</span>
<span class="sd"> |85f5955f4b27a9a4c...|</span>
<span class="sd"> +--------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;sha&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="input_file_block_length"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.input_file_block_length.html#pyspark.sql.functions.input_file_block_length">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">input_file_block_length</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the length of the block being read, or -1 if not available.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.read.text(&quot;python/test_support/sql/ages_newlines.csv&quot;, lineSep=&quot;,&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(input_file_block_length().alias(&#39;r&#39;)).first()</span>
<span class="sd"> Row(r=87)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;input_file_block_length&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="input_file_block_start"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.input_file_block_start.html#pyspark.sql.functions.input_file_block_start">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">input_file_block_start</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the start offset of the block being read, or -1 if not available.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.read.text(&quot;python/test_support/sql/ages_newlines.csv&quot;, lineSep=&quot;,&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(input_file_block_start().alias(&#39;r&#39;)).first()</span>
<span class="sd"> Row(r=0)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;input_file_block_start&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="reflect"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.reflect.html#pyspark.sql.functions.reflect">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">reflect</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Calls a method with reflection.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the first element should be a literal string for the class name,</span>
<span class="sd"> and the second element should be a literal string for the method name,</span>
<span class="sd"> and the remaining are input arguments to the Java method.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;a5cf6c42-0c85-418f-af6c-3e4e5b1328f2&quot;,)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... reflect(lit(&quot;java.util.UUID&quot;), lit(&quot;fromString&quot;), df.a).alias(&#39;r&#39;)</span>
<span class="sd"> ... ).collect()</span>
<span class="sd"> [Row(r=&#39;a5cf6c42-0c85-418f-af6c-3e4e5b1328f2&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">&quot;reflect&quot;</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div>
<div class="viewcode-block" id="java_method"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.java_method.html#pyspark.sql.functions.java_method">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">java_method</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Calls a method with reflection.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the first element should be a literal string for the class name,</span>
<span class="sd"> and the second element should be a literal string for the method name,</span>
<span class="sd"> and the remaining are input arguments to the Java method.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).select(</span>
<span class="sd"> ... sf.java_method(</span>
<span class="sd"> ... sf.lit(&quot;java.util.UUID&quot;),</span>
<span class="sd"> ... sf.lit(&quot;fromString&quot;),</span>
<span class="sd"> ... sf.lit(&quot;a5cf6c42-0c85-418f-af6c-3e4e5b1328f2&quot;)</span>
<span class="sd"> ... )</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +-----------------------------------------------------------------------------+</span>
<span class="sd"> |java_method(java.util.UUID, fromString, a5cf6c42-0c85-418f-af6c-3e4e5b1328f2)|</span>
<span class="sd"> +-----------------------------------------------------------------------------+</span>
<span class="sd"> |a5cf6c42-0c85-418f-af6c-3e4e5b1328f2 |</span>
<span class="sd"> +-----------------------------------------------------------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">&quot;java_method&quot;</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div>
<div class="viewcode-block" id="version"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.version.html#pyspark.sql.functions.version">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">version</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the Spark version. The string contains 2 fields, the first being a release version</span>
<span class="sd"> and the second being a git revision.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(version()).show(truncate=False) # doctest: +SKIP</span>
<span class="sd"> +----------------------------------------------+</span>
<span class="sd"> |version() |</span>
<span class="sd"> +----------------------------------------------+</span>
<span class="sd"> |3.5.0 cafbea5b13623276517a9d716f75745eff91f616|</span>
<span class="sd"> +----------------------------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;version&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="typeof"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.typeof.html#pyspark.sql.functions.typeof">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">typeof</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return DDL-formatted type string for the data type of the input.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1,)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(typeof(df.a).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;bigint&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;typeof&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="stack"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.stack.html#pyspark.sql.functions.stack">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">stack</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Separates `col1`, ..., `colk` into `n` rows. Uses column names col0, col1, etc. by default</span>
<span class="sd"> unless specified otherwise.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the first element should be a literal int for the number of rows to be separated,</span>
<span class="sd"> and the remaining are input elements to be separated.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, 2, 3)], [&quot;a&quot;, &quot;b&quot;, &quot;c&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(stack(lit(2), df.a, df.b, df.c)).show(truncate=False)</span>
<span class="sd"> +----+----+</span>
<span class="sd"> |col0|col1|</span>
<span class="sd"> +----+----+</span>
<span class="sd"> |1 |2 |</span>
<span class="sd"> |3 |NULL|</span>
<span class="sd"> +----+----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">&quot;stack&quot;</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div>
<div class="viewcode-block" id="bitmap_bit_position"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bitmap_bit_position.html#pyspark.sql.functions.bitmap_bit_position">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">bitmap_bit_position</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the bit position for the given input column.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The input column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(123,)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(bitmap_bit_position(df.a).alias(&quot;r&quot;)).collect()</span>
<span class="sd"> [Row(r=122)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;bitmap_bit_position&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="bitmap_bucket_number"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bitmap_bucket_number.html#pyspark.sql.functions.bitmap_bucket_number">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">bitmap_bucket_number</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the bucket number for the given input column.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The input column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(123,)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(bitmap_bucket_number(df.a).alias(&quot;r&quot;)).collect()</span>
<span class="sd"> [Row(r=1)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;bitmap_bucket_number&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="bitmap_construct_agg"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bitmap_construct_agg.html#pyspark.sql.functions.bitmap_construct_agg">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">bitmap_construct_agg</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a bitmap with the positions of the bits set from all the values from the input column.</span>
<span class="sd"> The input column will most likely be bitmap_bit_position().</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The input column will most likely be bitmap_bit_position().</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1,),(2,),(3,)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(substring(hex(</span>
<span class="sd"> ... bitmap_construct_agg(bitmap_bit_position(df.a))</span>
<span class="sd"> ... ), 0, 6).alias(&quot;r&quot;)).collect()</span>
<span class="sd"> [Row(r=&#39;070000&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;bitmap_construct_agg&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="bitmap_count"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bitmap_count.html#pyspark.sql.functions.bitmap_count">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">bitmap_count</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the number of set bits in the input bitmap.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The input bitmap.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;FFFF&quot;,)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(bitmap_count(to_binary(df.a, lit(&quot;hex&quot;))).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=16)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;bitmap_count&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="bitmap_or_agg"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bitmap_or_agg.html#pyspark.sql.functions.bitmap_or_agg">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">bitmap_or_agg</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a bitmap that is the bitwise OR of all of the bitmaps from the input column.</span>
<span class="sd"> The input column should be bitmaps created from bitmap_construct_agg().</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> The input column should be bitmaps created from bitmap_construct_agg().</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;10&quot;,),(&quot;20&quot;,),(&quot;40&quot;,)], [&quot;a&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(substring(hex(</span>
<span class="sd"> ... bitmap_or_agg(to_binary(df.a, lit(&quot;hex&quot;)))</span>
<span class="sd"> ... ), 0, 6).alias(&quot;r&quot;)).collect()</span>
<span class="sd"> [Row(r=&#39;700000&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">&quot;bitmap_or_agg&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<span class="c1"># ---------------------------- User Defined Function ----------------------------------</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">udf</span><span class="p">(</span>
<span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span>
<span class="n">returnType</span><span class="p">:</span> <span class="s2">&quot;DataTypeOrString&quot;</span> <span class="o">=</span> <span class="n">StringType</span><span class="p">(),</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">useArrow</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;UserDefinedFunctionLike&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">udf</span><span class="p">(</span>
<span class="n">f</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;DataTypeOrString&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">useArrow</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Any</span><span class="p">]],</span> <span class="s2">&quot;UserDefinedFunctionLike&quot;</span><span class="p">]:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">udf</span><span class="p">(</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">returnType</span><span class="p">:</span> <span class="s2">&quot;DataTypeOrString&quot;</span> <span class="o">=</span> <span class="n">StringType</span><span class="p">(),</span>
<span class="n">useArrow</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Any</span><span class="p">]],</span> <span class="s2">&quot;UserDefinedFunctionLike&quot;</span><span class="p">]:</span>
<span class="o">...</span>
<div class="viewcode-block" id="udf"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.udf.html#pyspark.sql.functions.udf">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">udf</span><span class="p">(</span>
<span class="n">f</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> <span class="s2">&quot;DataTypeOrString&quot;</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">returnType</span><span class="p">:</span> <span class="s2">&quot;DataTypeOrString&quot;</span> <span class="o">=</span> <span class="n">StringType</span><span class="p">(),</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">useArrow</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;UserDefinedFunctionLike&quot;</span><span class="p">,</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Any</span><span class="p">]],</span> <span class="s2">&quot;UserDefinedFunctionLike&quot;</span><span class="p">]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Creates a user defined function (UDF).</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> f : function</span>
<span class="sd"> python function if used as a standalone function</span>
<span class="sd"> returnType : :class:`pyspark.sql.types.DataType` or str</span>
<span class="sd"> the return type of the user-defined function. The value can be either a</span>
<span class="sd"> :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.</span>
<span class="sd"> useArrow : bool or None</span>
<span class="sd"> whether to use Arrow to optimize the (de)serialization. When it is None, the</span>
<span class="sd"> Spark config &quot;spark.sql.execution.pythonUDF.arrow.enabled&quot; takes effect.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.types import IntegerType</span>
<span class="sd"> &gt;&gt;&gt; slen = udf(lambda s: len(s), IntegerType())</span>
<span class="sd"> &gt;&gt;&gt; @udf</span>
<span class="sd"> ... def to_upper(s):</span>
<span class="sd"> ... if s is not None:</span>
<span class="sd"> ... return s.upper()</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; @udf(returnType=IntegerType())</span>
<span class="sd"> ... def add_one(x):</span>
<span class="sd"> ... if x is not None:</span>
<span class="sd"> ... return x + 1</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, &quot;John Doe&quot;, 21)], (&quot;id&quot;, &quot;name&quot;, &quot;age&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(slen(&quot;name&quot;).alias(&quot;slen(name)&quot;), to_upper(&quot;name&quot;), add_one(&quot;age&quot;)).show()</span>
<span class="sd"> +----------+--------------+------------+</span>
<span class="sd"> |slen(name)|to_upper(name)|add_one(age)|</span>
<span class="sd"> +----------+--------------+------------+</span>
<span class="sd"> | 8| JOHN DOE| 22|</span>
<span class="sd"> +----------+--------------+------------+</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The user-defined functions are considered deterministic by default. Due to</span>
<span class="sd"> optimization, duplicate invocations may be eliminated or the function may even be invoked</span>
<span class="sd"> more times than it is present in the query. If your function is not deterministic, call</span>
<span class="sd"> `asNondeterministic` on the user defined function. E.g.:</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.types import IntegerType</span>
<span class="sd"> &gt;&gt;&gt; import random</span>
<span class="sd"> &gt;&gt;&gt; random_udf = udf(lambda: int(random.random() * 100), IntegerType()).asNondeterministic()</span>
<span class="sd"> The user-defined functions do not support conditional expressions or short circuiting</span>
<span class="sd"> in boolean expressions and it ends up with being executed all internally. If the functions</span>
<span class="sd"> can fail on special rows, the workaround is to incorporate the condition into the functions.</span>
<span class="sd"> The user-defined functions do not take keyword arguments on the calling side.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># The following table shows most of Python data and SQL type conversions in normal UDFs that</span>
<span class="c1"># are not yet visible to the user. Some of behaviors are buggy and might be changed in the near</span>
<span class="c1"># future. The table might have to be eventually documented externally.</span>
<span class="c1"># Please see SPARK-28131&#39;s PR to see the codes in order to generate the table below.</span>
<span class="c1">#</span>
<span class="c1"># +-----------------------------+--------------+----------+------+---------------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+----------------------------+------------+--------------+------------------+----------------------+ # noqa</span>
<span class="c1"># |SQL Type \ Python Value(Type)|None(NoneType)|True(bool)|1(int)| a(str)| 1970-01-01(date)|1970-01-01 00:00:00(datetime)|1.0(float)|array(&#39;i&#39;, [1])(array)|[1](list)| (1,)(tuple)|bytearray(b&#39;ABC&#39;)(bytearray)| 1(Decimal)|{&#39;a&#39;: 1}(dict)|Row(kwargs=1)(Row)|Row(namedtuple=1)(Row)| # noqa</span>
<span class="c1"># +-----------------------------+--------------+----------+------+---------------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+----------------------------+------------+--------------+------------------+----------------------+ # noqa</span>
<span class="c1"># | boolean| None| True| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa</span>
<span class="c1"># | tinyint| None| None| 1| None| None| None| None| None| None| None| None| None| None| X| X| # noqa</span>
<span class="c1"># | smallint| None| None| 1| None| None| None| None| None| None| None| None| None| None| X| X| # noqa</span>
<span class="c1"># | int| None| None| 1| None| None| None| None| None| None| None| None| None| None| X| X| # noqa</span>
<span class="c1"># | bigint| None| None| 1| None| None| None| None| None| None| None| None| None| None| X| X| # noqa</span>
<span class="c1"># | string| None| &#39;true&#39;| &#39;1&#39;| &#39;a&#39;|&#39;java.util.Gregor...| &#39;java.util.Gregor...| &#39;1.0&#39;| &#39;[I@66cbb73a&#39;| &#39;[1]&#39;|&#39;[Ljava.lang.Obje...| &#39;[B@5a51eb1a&#39;| &#39;1&#39;| &#39;{a=1}&#39;| X| X| # noqa</span>
<span class="c1"># | date| None| X| X| X|datetime.date(197...| datetime.date(197...| X| X| X| X| X| X| X| X| X| # noqa</span>
<span class="c1"># | timestamp| None| X| X| X| X| datetime.datetime...| X| X| X| X| X| X| X| X| X| # noqa</span>
<span class="c1"># | float| None| None| None| None| None| None| 1.0| None| None| None| None| None| None| X| X| # noqa</span>
<span class="c1"># | double| None| None| None| None| None| None| 1.0| None| None| None| None| None| None| X| X| # noqa</span>
<span class="c1"># | array&lt;int&gt;| None| None| None| None| None| None| None| [1]| [1]| [1]| [65, 66, 67]| None| None| X| X| # noqa</span>
<span class="c1"># | binary| None| None| None|bytearray(b&#39;a&#39;)| None| None| None| None| None| None| bytearray(b&#39;ABC&#39;)| None| None| X| X| # noqa</span>
<span class="c1"># | decimal(10,0)| None| None| None| None| None| None| None| None| None| None| None|Decimal(&#39;1&#39;)| None| X| X| # noqa</span>
<span class="c1"># | map&lt;string,int&gt;| None| None| None| None| None| None| None| None| None| None| None| None| {&#39;a&#39;: 1}| X| X| # noqa</span>
<span class="c1"># | struct&lt;_1:int&gt;| None| X| X| X| X| X| X| X|Row(_1=1)| Row(_1=1)| X| X| Row(_1=None)| Row(_1=1)| Row(_1=1)| # noqa</span>
<span class="c1"># +-----------------------------+--------------+----------+------+---------------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+----------------------------+------------+--------------+------------------+----------------------+ # noqa</span>
<span class="c1">#</span>
<span class="c1"># Note: DDL formatted string is used for &#39;SQL Type&#39; for simplicity. This string can be</span>
<span class="c1"># used in `returnType`.</span>
<span class="c1"># Note: The values inside of the table are generated by `repr`.</span>
<span class="c1"># Note: &#39;X&#39; means it throws an exception during the conversion.</span>
<span class="c1"># decorator @udf, @udf(), @udf(dataType())</span>
<span class="k">if</span> <span class="n">f</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">DataType</span><span class="p">)):</span>
<span class="c1"># If DataType has been passed as a positional argument</span>
<span class="c1"># for decorator use it as a returnType</span>
<span class="n">return_type</span> <span class="o">=</span> <span class="n">f</span> <span class="ow">or</span> <span class="n">returnType</span>
<span class="k">return</span> <span class="n">functools</span><span class="o">.</span><span class="n">partial</span><span class="p">(</span>
<span class="n">_create_py_udf</span><span class="p">,</span>
<span class="n">returnType</span><span class="o">=</span><span class="n">return_type</span><span class="p">,</span>
<span class="n">useArrow</span><span class="o">=</span><span class="n">useArrow</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_create_py_udf</span><span class="p">(</span><span class="n">f</span><span class="o">=</span><span class="n">f</span><span class="p">,</span> <span class="n">returnType</span><span class="o">=</span><span class="n">returnType</span><span class="p">,</span> <span class="n">useArrow</span><span class="o">=</span><span class="n">useArrow</span><span class="p">)</span></div>
<div class="viewcode-block" id="udtf"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.udtf.html#pyspark.sql.functions.udtf">[docs]</a><span class="nd">@try_remote_functions</span>
<span class="k">def</span> <span class="nf">udtf</span><span class="p">(</span>
<span class="bp">cls</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Type</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">returnType</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">StructType</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span>
<span class="n">useArrow</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;UserDefinedTableFunction&quot;</span><span class="p">,</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Type</span><span class="p">],</span> <span class="s2">&quot;UserDefinedTableFunction&quot;</span><span class="p">]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Creates a user defined table function (UDTF).</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cls : class</span>
<span class="sd"> the Python user-defined table function handler class.</span>
<span class="sd"> returnType : :class:`pyspark.sql.types.StructType` or str</span>
<span class="sd"> the return type of the user-defined table function. The value can be either a</span>
<span class="sd"> :class:`pyspark.sql.types.StructType` object or a DDL-formatted struct type string.</span>
<span class="sd"> useArrow : bool or None, optional</span>
<span class="sd"> whether to use Arrow to optimize the (de)serializations. When it&#39;s set to None, the</span>
<span class="sd"> Spark config &quot;spark.sql.execution.pythonUDTF.arrow.enabled&quot; is used.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Implement the UDTF class and create a UDTF:</span>
<span class="sd"> &gt;&gt;&gt; class TestUDTF:</span>
<span class="sd"> ... def eval(self, *args: Any):</span>
<span class="sd"> ... yield &quot;hello&quot;, &quot;world&quot;</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import udtf</span>
<span class="sd"> &gt;&gt;&gt; test_udtf = udtf(TestUDTF, returnType=&quot;c1: string, c2: string&quot;)</span>
<span class="sd"> &gt;&gt;&gt; test_udtf().show()</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> | c1| c2|</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> |hello|world|</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> UDTF can also be created using the decorator syntax:</span>
<span class="sd"> &gt;&gt;&gt; @udtf(returnType=&quot;c1: int, c2: int&quot;)</span>
<span class="sd"> ... class PlusOne:</span>
<span class="sd"> ... def eval(self, x: int):</span>
<span class="sd"> ... yield x, x + 1</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import lit</span>
<span class="sd"> &gt;&gt;&gt; PlusOne(lit(1)).show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | c1| c2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | 1| 2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> Arrow optimization can be explicitly enabled when creating UDTFs:</span>
<span class="sd"> &gt;&gt;&gt; @udtf(returnType=&quot;c1: int, c2: int&quot;, useArrow=True)</span>
<span class="sd"> ... class ArrowPlusOne:</span>
<span class="sd"> ... def eval(self, x: int):</span>
<span class="sd"> ... yield x, x + 1</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; ArrowPlusOne(lit(1)).show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | c1| c2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | 1| 2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> User-defined table functions (UDTFs) are considered non-deterministic by default.</span>
<span class="sd"> Use `asDeterministic()` to mark a function as deterministic. E.g.:</span>
<span class="sd"> &gt;&gt;&gt; class PlusOne:</span>
<span class="sd"> ... def eval(self, a: int):</span>
<span class="sd"> ... yield a + 1,</span>
<span class="sd"> &gt;&gt;&gt; plus_one = udtf(PlusOne, returnType=&quot;r: int&quot;).asDeterministic()</span>
<span class="sd"> Use &quot;yield&quot; to produce one row for the UDTF result relation as many times</span>
<span class="sd"> as needed. In the context of a lateral join, each such result row will be</span>
<span class="sd"> associated with the most recent input row consumed from the &quot;eval&quot; method.</span>
<span class="sd"> User-defined table functions are considered opaque to the optimizer by default.</span>
<span class="sd"> As a result, operations like filters from WHERE clauses or limits from</span>
<span class="sd"> LIMIT/OFFSET clauses that appear after the UDTF call will execute on the</span>
<span class="sd"> UDTF&#39;s result relation. By the same token, any relations forwarded as input</span>
<span class="sd"> to UDTFs will plan as full table scans in the absence of any explicit such</span>
<span class="sd"> filtering or other logic explicitly written in a table subquery surrounding the</span>
<span class="sd"> provided input relation.</span>
<span class="sd"> User-defined table functions do not accept keyword arguments on the calling side.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">cls</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">functools</span><span class="o">.</span><span class="n">partial</span><span class="p">(</span><span class="n">_create_py_udtf</span><span class="p">,</span> <span class="n">returnType</span><span class="o">=</span><span class="n">returnType</span><span class="p">,</span> <span class="n">useArrow</span><span class="o">=</span><span class="n">useArrow</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_create_py_udtf</span><span class="p">(</span><span class="bp">cls</span><span class="o">=</span><span class="bp">cls</span><span class="p">,</span> <span class="n">returnType</span><span class="o">=</span><span class="n">returnType</span><span class="p">,</span> <span class="n">useArrow</span><span class="o">=</span><span class="n">useArrow</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_test</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="kn">import</span> <span class="nn">doctest</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span>
<span class="kn">import</span> <span class="nn">pyspark.sql.functions</span>
<span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">&quot;local[4]&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">&quot;sql.functions tests&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">sparkContext</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;sc&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">sc</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;spark&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">spark</span>
<span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span>
<span class="n">pyspark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">functions</span><span class="p">,</span>
<span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span>
<span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span>
<span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span>
<span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
<span class="n">_test</span><span class="p">()</span>
</pre></div>
</div>
</main>
</div>
</div>
<script src="../../../_static/scripts/pydata-sphinx-theme.js?digest=1999514e3f237ded88cf"></script>
<footer class="footer mt-5 mt-md-0">
<div class="container">
<div class="footer-item">
<p class="copyright">
&copy; Copyright .<br>
</p>
</div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="http://sphinx-doc.org/">Sphinx</a> 3.0.4.<br>
</p>
</div>
</div>
</footer>
</body>
</html>