Source code for pyspark.sql.functions

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd">A collections of builtin functions</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="kn">import</span> <span class="nn">functools</span>
<span class="kn">import</span> <span class="nn">warnings</span>
<span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">since</span><span class="p">,</span> <span class="n">SparkContext</span>
<span class="kn">from</span> <span class="nn">pyspark.rdd</span> <span class="kn">import</span> <span class="n">PythonEvalType</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.column</span> <span class="kn">import</span> <span class="n">Column</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">,</span> <span class="n">_create_column_from_literal</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.dataframe</span> <span class="kn">import</span> <span class="n">DataFrame</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="n">StringType</span><span class="p">,</span> <span class="n">DataType</span>
<span class="c1"># Keep UserDefinedFunction import for backwards compatible import; moved in SPARK-22409</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.udf</span> <span class="kn">import</span> <span class="n">UserDefinedFunction</span><span class="p">,</span> <span class="n">_create_udf</span> <span class="c1"># noqa: F401</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.udf</span> <span class="kn">import</span> <span class="n">_create_udf</span>
<span class="c1"># Keep pandas_udf and PandasUDFType import for backwards compatible import; moved in SPARK-28264</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.pandas.functions</span> <span class="kn">import</span> <span class="n">pandas_udf</span><span class="p">,</span> <span class="n">PandasUDFType</span> <span class="c1"># noqa: F401</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.utils</span> <span class="kn">import</span> <span class="n">to_str</span>
<span class="c1"># Note to developers: all of PySpark functions here take string as column names whenever possible.</span>
<span class="c1"># Namely, if columns are referred as arguments, they can be always both Column or string,</span>
<span class="c1"># even though there might be few exceptions for legacy or inevitable reasons.</span>
<span class="c1"># If you are fixing other language APIs together, also please note that Scala side is not the case</span>
<span class="c1"># since it requires to make every single overridden definition.</span>
<span class="k">def</span> <span class="nf">_get_get_jvm_function</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">sc</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Retrieves JVM function identified by name from</span>
<span class="sd"> Java gateway associated with sc.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="p">,</span> <span class="n">name</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_invoke_function</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Invokes JVM function identified by name with args</span>
<span class="sd"> and wraps the result with :class:`~pyspark.sql.Column`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">jf</span> <span class="o">=</span> <span class="n">_get_get_jvm_function</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jf</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">))</span>
<span class="k">def</span> <span class="nf">_invoke_function_over_column</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Invokes unary JVM function identified by name</span>
<span class="sd"> and wraps the result with :class:`~pyspark.sql.Column`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">))</span>
<span class="k">def</span> <span class="nf">_invoke_binary_math_function</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Invokes binary JVM math function identified by name</span>
<span class="sd"> and wraps the result with :class:`~pyspark.sql.Column`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span>
<span class="n">name</span><span class="p">,</span>
<span class="c1"># For legacy reasons, the arguments here can be implicitly converted into floats,</span>
<span class="c1"># if they are not columns or strings.</span>
<span class="n">_to_java_column</span><span class="p">(</span><span class="n">col1</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">))</span> <span class="k">else</span> <span class="nb">float</span><span class="p">(</span><span class="n">col1</span><span class="p">),</span>
<span class="n">_to_java_column</span><span class="p">(</span><span class="n">col2</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col2</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">))</span> <span class="k">else</span> <span class="nb">float</span><span class="p">(</span><span class="n">col2</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="k">if</span> <span class="n">options</span><span class="p">:</span>
<span class="k">return</span> <span class="p">{</span><span class="n">key</span><span class="p">:</span> <span class="n">to_str</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> <span class="k">for</span> <span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span> <span class="ow">in</span> <span class="n">options</span><span class="o">.</span><span class="n">items</span><span class="p">()}</span>
<span class="k">return</span> <span class="p">{}</span>
<div class="viewcode-block" id="lit"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.lit.html#pyspark.sql.functions.lit">[docs]</a><span class="k">def</span> <span class="nf">lit</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Creates a :class:`~pyspark.sql.Column` of literal value.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.select(lit(5).alias(&#39;height&#39;)).withColumn(&#39;spark_user&#39;, lit(True)).take(1)</span>
<span class="sd"> [Row(height=5, spark_user=True)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">col</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> <span class="k">else</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;lit&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="col"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.col.html#pyspark.sql.functions.col">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.3</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">col</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a :class:`~pyspark.sql.Column` based on the given column name.&#39;</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; col(&#39;x&#39;)</span>
<span class="sd"> Column&lt;&#39;x&#39;&gt;</span>
<span class="sd"> &gt;&gt;&gt; column(&#39;x&#39;)</span>
<span class="sd"> Column&lt;&#39;x&#39;&gt;</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;col&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<span class="n">column</span> <span class="o">=</span> <span class="n">col</span>
<div class="viewcode-block" id="asc"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.asc.html#pyspark.sql.functions.asc">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.3</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">asc</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a sort expression based on the ascending order of the given column name.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="p">(</span>
<span class="n">col</span><span class="o">.</span><span class="n">asc</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span>
<span class="k">else</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;asc&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="desc"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.desc.html#pyspark.sql.functions.desc">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.3</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">desc</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a sort expression based on the descending order of the given column name.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="p">(</span>
<span class="n">col</span><span class="o">.</span><span class="n">desc</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span>
<span class="k">else</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;desc&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="sqrt"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.sqrt.html#pyspark.sql.functions.sqrt">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.3</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">sqrt</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the square root of the specified float value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;sqrt&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="abs"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.abs.html#pyspark.sql.functions.abs">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.3</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">abs</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the absolute value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;abs&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="max"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.max.html#pyspark.sql.functions.max">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.3</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">max</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the maximum value of the expression in a group.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;max&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="min"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.min.html#pyspark.sql.functions.min">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.3</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">min</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the minimum value of the expression in a group.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;min&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="count"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.count.html#pyspark.sql.functions.count">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.3</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">count</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the number of items in a group.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;count&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="sum"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.sum.html#pyspark.sql.functions.sum">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.3</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">sum</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the sum of all values in the expression.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;sum&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="avg"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.avg.html#pyspark.sql.functions.avg">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.3</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">avg</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the average of the values in a group.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;avg&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="mean"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.mean.html#pyspark.sql.functions.mean">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.3</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">mean</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the average of the values in a group.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;mean&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="sumDistinct"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.sumDistinct.html#pyspark.sql.functions.sumDistinct">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.3</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">sumDistinct</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the sum of distinct values in the expression.</span>
<span class="sd"> .. deprecated:: 3.2.0</span>
<span class="sd"> Use :func:`sum_distinct` instead.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">&quot;Deprecated in 3.2, use sum_distinct instead.&quot;</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span>
<span class="k">return</span> <span class="n">sum_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="sum_distinct"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.sum_distinct.html#pyspark.sql.functions.sum_distinct">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">3.2</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">sum_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the sum of distinct values in the expression.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;sum_distinct&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="product"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.product.html#pyspark.sql.functions.product">[docs]</a><span class="k">def</span> <span class="nf">product</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the product of the values in a group.</span>
<span class="sd"> .. versionadded:: 3.2.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : str, :class:`Column`</span>
<span class="sd"> column containing values to be multiplied together</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1, 10).toDF(&#39;x&#39;).withColumn(&#39;mod3&#39;, col(&#39;x&#39;) % 3)</span>
<span class="sd"> &gt;&gt;&gt; prods = df.groupBy(&#39;mod3&#39;).agg(product(&#39;x&#39;).alias(&#39;product&#39;))</span>
<span class="sd"> &gt;&gt;&gt; prods.orderBy(&#39;mod3&#39;).show()</span>
<span class="sd"> +----+-------+</span>
<span class="sd"> |mod3|product|</span>
<span class="sd"> +----+-------+</span>
<span class="sd"> | 0| 162.0|</span>
<span class="sd"> | 1| 28.0|</span>
<span class="sd"> | 2| 80.0|</span>
<span class="sd"> +----+-------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;product&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="acos"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.acos.html#pyspark.sql.functions.acos">[docs]</a><span class="k">def</span> <span class="nf">acos</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> inverse cosine of `col`, as if computed by `java.lang.Math.acos()`</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;acos&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="acosh"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.acosh.html#pyspark.sql.functions.acosh">[docs]</a><span class="k">def</span> <span class="nf">acosh</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes inverse hyperbolic cosine of the input column.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;acosh&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="asin"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.asin.html#pyspark.sql.functions.asin">[docs]</a><span class="k">def</span> <span class="nf">asin</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> inverse sine of `col`, as if computed by `java.lang.Math.asin()`</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;asin&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="asinh"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.asinh.html#pyspark.sql.functions.asinh">[docs]</a><span class="k">def</span> <span class="nf">asinh</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes inverse hyperbolic sine of the input column.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;asinh&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="atan"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.atan.html#pyspark.sql.functions.atan">[docs]</a><span class="k">def</span> <span class="nf">atan</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> inverse tangent of `col`, as if computed by `java.lang.Math.atan()`</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;atan&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="atanh"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.atanh.html#pyspark.sql.functions.atanh">[docs]</a><span class="k">def</span> <span class="nf">atanh</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes inverse hyperbolic tangent of the input column.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;atanh&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="cbrt"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.cbrt.html#pyspark.sql.functions.cbrt">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.4</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">cbrt</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the cube-root of the given value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;cbrt&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="ceil"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.ceil.html#pyspark.sql.functions.ceil">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.4</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">ceil</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the ceiling of the given value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;ceil&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="cos"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.cos.html#pyspark.sql.functions.cos">[docs]</a><span class="k">def</span> <span class="nf">cos</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> angle in radians</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> cosine of the angle, as if computed by `java.lang.Math.cos()`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;cos&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="cosh"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.cosh.html#pyspark.sql.functions.cosh">[docs]</a><span class="k">def</span> <span class="nf">cosh</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> hyperbolic angle</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> hyperbolic cosine of the angle, as if computed by `java.lang.Math.cosh()`</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;cosh&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="exp"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.exp.html#pyspark.sql.functions.exp">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.4</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">exp</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the exponential of the given value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;exp&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="expm1"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.expm1.html#pyspark.sql.functions.expm1">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.4</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">expm1</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the exponential of the given value minus one.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;expm1&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="floor"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.floor.html#pyspark.sql.functions.floor">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.4</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">floor</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the floor of the given value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;floor&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<span class="nd">@since</span><span class="p">(</span><span class="mf">1.4</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">log</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the natural logarithm of the given value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;log&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<div class="viewcode-block" id="log10"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.log10.html#pyspark.sql.functions.log10">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.4</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">log10</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the logarithm of the given value in Base 10.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;log10&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="log1p"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.log1p.html#pyspark.sql.functions.log1p">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.4</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">log1p</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the natural logarithm of the given value plus one.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;log1p&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="rint"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.rint.html#pyspark.sql.functions.rint">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.4</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">rint</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the double value that is closest in value to the argument and</span>
<span class="sd"> is equal to a mathematical integer.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;rint&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="signum"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.signum.html#pyspark.sql.functions.signum">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.4</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">signum</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the signum of the given value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;signum&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="sin"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.sin.html#pyspark.sql.functions.sin">[docs]</a><span class="k">def</span> <span class="nf">sin</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> sine of the angle, as if computed by `java.lang.Math.sin()`</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;sin&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="sinh"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.sinh.html#pyspark.sql.functions.sinh">[docs]</a><span class="k">def</span> <span class="nf">sinh</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> hyperbolic angle</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> hyperbolic sine of the given value,</span>
<span class="sd"> as if computed by `java.lang.Math.sinh()`</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;sinh&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="tan"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.tan.html#pyspark.sql.functions.tan">[docs]</a><span class="k">def</span> <span class="nf">tan</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> angle in radians</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> tangent of the given value, as if computed by `java.lang.Math.tan()`</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;tan&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="tanh"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.tanh.html#pyspark.sql.functions.tanh">[docs]</a><span class="k">def</span> <span class="nf">tanh</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> hyperbolic angle</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> hyperbolic tangent of the given value</span>
<span class="sd"> as if computed by `java.lang.Math.tanh()`</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;tanh&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="toDegrees"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.toDegrees.html#pyspark.sql.functions.toDegrees">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.4</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">toDegrees</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> .. deprecated:: 2.1.0</span>
<span class="sd"> Use :func:`degrees` instead.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">&quot;Deprecated in 2.1, use degrees instead.&quot;</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span>
<span class="k">return</span> <span class="n">degrees</span><span class="p">(</span><span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="toRadians"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.toRadians.html#pyspark.sql.functions.toRadians">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.4</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">toRadians</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> .. deprecated:: 2.1.0</span>
<span class="sd"> Use :func:`radians` instead.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">&quot;Deprecated in 2.1, use radians instead.&quot;</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span>
<span class="k">return</span> <span class="n">radians</span><span class="p">(</span><span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="bitwiseNOT"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.bitwiseNOT.html#pyspark.sql.functions.bitwiseNOT">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.4</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">bitwiseNOT</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes bitwise not.</span>
<span class="sd"> .. deprecated:: 3.2.0</span>
<span class="sd"> Use :func:`bitwise_not` instead.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">&quot;Deprecated in 3.2, use bitwise_not instead.&quot;</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span>
<span class="k">return</span> <span class="n">bitwise_not</span><span class="p">(</span><span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="bitwise_not"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.bitwise_not.html#pyspark.sql.functions.bitwise_not">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">3.2</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">bitwise_not</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes bitwise not.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;bitwise_not&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="asc_nulls_first"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.asc_nulls_first.html#pyspark.sql.functions.asc_nulls_first">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">2.4</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">asc_nulls_first</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a sort expression based on the ascending order of the given</span>
<span class="sd"> column name, and null values return before non-null values.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="p">(</span>
<span class="n">col</span><span class="o">.</span><span class="n">asc_nulls_first</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span>
<span class="k">else</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;asc_nulls_first&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="asc_nulls_last"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.asc_nulls_last.html#pyspark.sql.functions.asc_nulls_last">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">2.4</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">asc_nulls_last</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a sort expression based on the ascending order of the given</span>
<span class="sd"> column name, and null values appear after non-null values.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="p">(</span>
<span class="n">col</span><span class="o">.</span><span class="n">asc_nulls_last</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span>
<span class="k">else</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;asc_nulls_last&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="desc_nulls_first"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.desc_nulls_first.html#pyspark.sql.functions.desc_nulls_first">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">2.4</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">desc_nulls_first</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a sort expression based on the descending order of the given</span>
<span class="sd"> column name, and null values appear before non-null values.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="p">(</span>
<span class="n">col</span><span class="o">.</span><span class="n">desc_nulls_first</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span>
<span class="k">else</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;desc_nulls_first&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="desc_nulls_last"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.desc_nulls_last.html#pyspark.sql.functions.desc_nulls_last">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">2.4</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">desc_nulls_last</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a sort expression based on the descending order of the given</span>
<span class="sd"> column name, and null values appear after non-null values.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="p">(</span>
<span class="n">col</span><span class="o">.</span><span class="n">desc_nulls_last</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span>
<span class="k">else</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;desc_nulls_last&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="stddev"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.stddev.html#pyspark.sql.functions.stddev">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.6</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">stddev</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: alias for stddev_samp.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;stddev&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="stddev_samp"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.stddev_samp.html#pyspark.sql.functions.stddev_samp">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.6</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">stddev_samp</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the unbiased sample standard deviation of</span>
<span class="sd"> the expression in a group.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;stddev_samp&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="stddev_pop"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.stddev_pop.html#pyspark.sql.functions.stddev_pop">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.6</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">stddev_pop</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns population standard deviation of</span>
<span class="sd"> the expression in a group.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;stddev_pop&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="variance"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.variance.html#pyspark.sql.functions.variance">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.6</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">variance</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: alias for var_samp</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;variance&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="var_samp"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.var_samp.html#pyspark.sql.functions.var_samp">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.6</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">var_samp</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the unbiased sample variance of</span>
<span class="sd"> the values in a group.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;var_samp&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="var_pop"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.var_pop.html#pyspark.sql.functions.var_pop">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.6</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">var_pop</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the population variance of the values in a group.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;var_pop&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="skewness"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.skewness.html#pyspark.sql.functions.skewness">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.6</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">skewness</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the skewness of the values in a group.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;skewness&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="kurtosis"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.kurtosis.html#pyspark.sql.functions.kurtosis">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.6</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">kurtosis</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the kurtosis of the values in a group.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;kurtosis&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="collect_list"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.collect_list.html#pyspark.sql.functions.collect_list">[docs]</a><span class="k">def</span> <span class="nf">collect_list</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns a list of objects with duplicates.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The function is non-deterministic because the order of collected results depends</span>
<span class="sd"> on the order of the rows which may be non-deterministic after a shuffle.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(2,), (5,), (5,)], (&#39;age&#39;,))</span>
<span class="sd"> &gt;&gt;&gt; df2.agg(collect_list(&#39;age&#39;)).collect()</span>
<span class="sd"> [Row(collect_list(age)=[2, 5, 5])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;collect_list&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="collect_set"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.collect_set.html#pyspark.sql.functions.collect_set">[docs]</a><span class="k">def</span> <span class="nf">collect_set</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns a set of objects with duplicate elements eliminated.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The function is non-deterministic because the order of collected results depends</span>
<span class="sd"> on the order of the rows which may be non-deterministic after a shuffle.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(2,), (5,), (5,)], (&#39;age&#39;,))</span>
<span class="sd"> &gt;&gt;&gt; df2.agg(collect_set(&#39;age&#39;)).collect()</span>
<span class="sd"> [Row(collect_set(age)=[5, 2])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;collect_set&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="degrees"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.degrees.html#pyspark.sql.functions.degrees">[docs]</a><span class="k">def</span> <span class="nf">degrees</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts an angle measured in radians to an approximately equivalent angle</span>
<span class="sd"> measured in degrees.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> angle in radians</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> angle in degrees, as if computed by `java.lang.Math.toDegrees()`</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;degrees&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="radians"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.radians.html#pyspark.sql.functions.radians">[docs]</a><span class="k">def</span> <span class="nf">radians</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts an angle measured in degrees to an approximately equivalent angle</span>
<span class="sd"> measured in radians.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> angle in degrees</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> angle in radians, as if computed by `java.lang.Math.toRadians()`</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;radians&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="atan2"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.atan2.html#pyspark.sql.functions.atan2">[docs]</a><span class="k">def</span> <span class="nf">atan2</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : str, :class:`~pyspark.sql.Column` or float</span>
<span class="sd"> coordinate on y-axis</span>
<span class="sd"> col2 : str, :class:`~pyspark.sql.Column` or float</span>
<span class="sd"> coordinate on x-axis</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> the `theta` component of the point</span>
<span class="sd"> (`r`, `theta`)</span>
<span class="sd"> in polar coordinates that corresponds to the point</span>
<span class="sd"> (`x`, `y`) in Cartesian coordinates,</span>
<span class="sd"> as if computed by `java.lang.Math.atan2()`</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_binary_math_function</span><span class="p">(</span><span class="s2">&quot;atan2&quot;</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div>
<div class="viewcode-block" id="hypot"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.hypot.html#pyspark.sql.functions.hypot">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.4</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">hypot</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes ``sqrt(a^2 + b^2)`` without intermediate overflow or underflow.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_binary_math_function</span><span class="p">(</span><span class="s2">&quot;hypot&quot;</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div>
<div class="viewcode-block" id="pow"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.pow.html#pyspark.sql.functions.pow">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.4</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">pow</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the value of the first argument raised to the power of the second argument.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_binary_math_function</span><span class="p">(</span><span class="s2">&quot;pow&quot;</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div>
<div class="viewcode-block" id="row_number"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.row_number.html#pyspark.sql.functions.row_number">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.6</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">row_number</span><span class="p">():</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Window function: returns a sequential number starting at 1 within a window partition.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;row_number&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="dense_rank"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.dense_rank.html#pyspark.sql.functions.dense_rank">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.6</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">dense_rank</span><span class="p">():</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Window function: returns the rank of rows within a window partition, without any gaps.</span>
<span class="sd"> The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking</span>
<span class="sd"> sequence when there are ties. That is, if you were ranking a competition using dense_rank</span>
<span class="sd"> and had three people tie for second place, you would say that all three were in second</span>
<span class="sd"> place and that the next person came in third. Rank would give me sequential numbers, making</span>
<span class="sd"> the person that came in third place (after the ties) would register as coming in fifth.</span>
<span class="sd"> This is equivalent to the DENSE_RANK function in SQL.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;dense_rank&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="rank"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.rank.html#pyspark.sql.functions.rank">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.6</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">rank</span><span class="p">():</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Window function: returns the rank of rows within a window partition.</span>
<span class="sd"> The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking</span>
<span class="sd"> sequence when there are ties. That is, if you were ranking a competition using dense_rank</span>
<span class="sd"> and had three people tie for second place, you would say that all three were in second</span>
<span class="sd"> place and that the next person came in third. Rank would give me sequential numbers, making</span>
<span class="sd"> the person that came in third place (after the ties) would register as coming in fifth.</span>
<span class="sd"> This is equivalent to the RANK function in SQL.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;rank&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="cume_dist"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.cume_dist.html#pyspark.sql.functions.cume_dist">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.6</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">cume_dist</span><span class="p">():</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Window function: returns the cumulative distribution of values within a window partition,</span>
<span class="sd"> i.e. the fraction of rows that are below the current row.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;cume_dist&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="percent_rank"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.percent_rank.html#pyspark.sql.functions.percent_rank">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.6</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">percent_rank</span><span class="p">():</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Window function: returns the relative rank (i.e. percentile) of rows within a window partition.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">&quot;percent_rank&quot;</span><span class="p">)</span></div>
<div class="viewcode-block" id="approxCountDistinct"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.approxCountDistinct.html#pyspark.sql.functions.approxCountDistinct">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.3</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">approxCountDistinct</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">rsd</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> .. deprecated:: 2.1.0</span>
<span class="sd"> Use :func:`approx_count_distinct` instead.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">&quot;Deprecated in 2.1, use approx_count_distinct instead.&quot;</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span>
<span class="k">return</span> <span class="n">approx_count_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">rsd</span><span class="p">)</span></div>
<div class="viewcode-block" id="approx_count_distinct"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.approx_count_distinct.html#pyspark.sql.functions.approx_count_distinct">[docs]</a><span class="k">def</span> <span class="nf">approx_count_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">rsd</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Aggregate function: returns a new :class:`~pyspark.sql.Column` for approximate distinct count</span>
<span class="sd"> of column `col`.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> rsd : float, optional</span>
<span class="sd"> maximum relative standard deviation allowed (default = 0.05).</span>
<span class="sd"> For rsd &lt; 0.01, it is more efficient to use :func:`count_distinct`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.agg(approx_count_distinct(df.age).alias(&#39;distinct_ages&#39;)).collect()</span>
<span class="sd"> [Row(distinct_ages=2)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="n">rsd</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">approx_count_distinct</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">approx_count_distinct</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">rsd</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="broadcast"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.broadcast.html#pyspark.sql.functions.broadcast">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.6</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">broadcast</span><span class="p">(</span><span class="n">df</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Marks a DataFrame as small enough for use in broadcast joins.&quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">broadcast</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">_jdf</span><span class="p">),</span> <span class="n">df</span><span class="o">.</span><span class="n">sql_ctx</span><span class="p">)</span></div>
<div class="viewcode-block" id="coalesce"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.coalesce.html#pyspark.sql.functions.coalesce">[docs]</a><span class="k">def</span> <span class="nf">coalesce</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Returns the first column that is not null.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; cDf = spark.createDataFrame([(None, None), (1, None), (None, 2)], (&quot;a&quot;, &quot;b&quot;))</span>
<span class="sd"> &gt;&gt;&gt; cDf.show()</span>
<span class="sd"> +----+----+</span>
<span class="sd"> | a| b|</span>
<span class="sd"> +----+----+</span>
<span class="sd"> |null|null|</span>
<span class="sd"> | 1|null|</span>
<span class="sd"> |null| 2|</span>
<span class="sd"> +----+----+</span>
<span class="sd"> &gt;&gt;&gt; cDf.select(coalesce(cDf[&quot;a&quot;], cDf[&quot;b&quot;])).show()</span>
<span class="sd"> +--------------+</span>
<span class="sd"> |coalesce(a, b)|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> | null|</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 2|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> &gt;&gt;&gt; cDf.select(&#39;*&#39;, coalesce(cDf[&quot;a&quot;], lit(0.0))).show()</span>
<span class="sd"> +----+----+----------------+</span>
<span class="sd"> | a| b|coalesce(a, 0.0)|</span>
<span class="sd"> +----+----+----------------+</span>
<span class="sd"> |null|null| 0.0|</span>
<span class="sd"> | 1|null| 1.0|</span>
<span class="sd"> |null| 2| 0.0|</span>
<span class="sd"> +----+----+----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="corr"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.corr.html#pyspark.sql.functions.corr">[docs]</a><span class="k">def</span> <span class="nf">corr</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Returns a new :class:`~pyspark.sql.Column` for the Pearson Correlation Coefficient for</span>
<span class="sd"> ``col1`` and ``col2``.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; a = range(20)</span>
<span class="sd"> &gt;&gt;&gt; b = [2 * x for x in range(20)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(zip(a, b), [&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.agg(corr(&quot;a&quot;, &quot;b&quot;).alias(&#39;c&#39;)).collect()</span>
<span class="sd"> [Row(c=1.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">corr</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col1</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col2</span><span class="p">)))</span></div>
<div class="viewcode-block" id="covar_pop"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.covar_pop.html#pyspark.sql.functions.covar_pop">[docs]</a><span class="k">def</span> <span class="nf">covar_pop</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Returns a new :class:`~pyspark.sql.Column` for the population covariance of ``col1`` and</span>
<span class="sd"> ``col2``.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; a = [1] * 10</span>
<span class="sd"> &gt;&gt;&gt; b = [1] * 10</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(zip(a, b), [&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.agg(covar_pop(&quot;a&quot;, &quot;b&quot;).alias(&#39;c&#39;)).collect()</span>
<span class="sd"> [Row(c=0.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">covar_pop</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col1</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col2</span><span class="p">)))</span></div>
<div class="viewcode-block" id="covar_samp"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.covar_samp.html#pyspark.sql.functions.covar_samp">[docs]</a><span class="k">def</span> <span class="nf">covar_samp</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Returns a new :class:`~pyspark.sql.Column` for the sample covariance of ``col1`` and</span>
<span class="sd"> ``col2``.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; a = [1] * 10</span>
<span class="sd"> &gt;&gt;&gt; b = [1] * 10</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(zip(a, b), [&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.agg(covar_samp(&quot;a&quot;, &quot;b&quot;).alias(&#39;c&#39;)).collect()</span>
<span class="sd"> [Row(c=0.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">covar_samp</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col1</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col2</span><span class="p">)))</span></div>
<div class="viewcode-block" id="countDistinct"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.countDistinct.html#pyspark.sql.functions.countDistinct">[docs]</a><span class="k">def</span> <span class="nf">countDistinct</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Returns a new :class:`~pyspark.sql.Column` for distinct count of ``col`` or ``cols``.</span>
<span class="sd"> An alias of :func:`count_distinct`, and it is encouraged to use :func:`count_distinct`</span>
<span class="sd"> directly.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">count_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">)</span></div>
<div class="viewcode-block" id="count_distinct"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.count_distinct.html#pyspark.sql.functions.count_distinct">[docs]</a><span class="k">def</span> <span class="nf">count_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.</span>
<span class="sd"> .. versionadded:: 3.2.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.agg(count_distinct(df.age, df.name).alias(&#39;c&#39;)).collect()</span>
<span class="sd"> [Row(c=2)]</span>
<span class="sd"> &gt;&gt;&gt; df.agg(count_distinct(&quot;age&quot;, &quot;name&quot;).alias(&#39;c&#39;)).collect()</span>
<span class="sd"> [Row(c=2)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">count_distinct</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="first"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.first.html#pyspark.sql.functions.first">[docs]</a><span class="k">def</span> <span class="nf">first</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">ignorenulls</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Aggregate function: returns the first value in a group.</span>
<span class="sd"> The function by default returns the first values it sees. It will return the first non-null</span>
<span class="sd"> value it sees when ignoreNulls is set to true. If all values are null, then null is returned.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The function is non-deterministic because its results depends on the order of the</span>
<span class="sd"> rows which may be non-deterministic after a shuffle.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">first</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">ignorenulls</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="grouping"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.grouping.html#pyspark.sql.functions.grouping">[docs]</a><span class="k">def</span> <span class="nf">grouping</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: indicates whether a specified column in a GROUP BY list is aggregated</span>
<span class="sd"> or not, returns 1 for aggregated or 0 for not aggregated in the result set.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.cube(&quot;name&quot;).agg(grouping(&quot;name&quot;), sum(&quot;age&quot;)).orderBy(&quot;name&quot;).show()</span>
<span class="sd"> +-----+--------------+--------+</span>
<span class="sd"> | name|grouping(name)|sum(age)|</span>
<span class="sd"> +-----+--------------+--------+</span>
<span class="sd"> | null| 1| 7|</span>
<span class="sd"> |Alice| 0| 2|</span>
<span class="sd"> | Bob| 0| 5|</span>
<span class="sd"> +-----+--------------+--------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">grouping</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="grouping_id"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.grouping_id.html#pyspark.sql.functions.grouping_id">[docs]</a><span class="k">def</span> <span class="nf">grouping_id</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Aggregate function: returns the level of grouping, equals to</span>
<span class="sd"> (grouping(c1) &lt;&lt; (n-1)) + (grouping(c2) &lt;&lt; (n-2)) + ... + grouping(cn)</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The list of columns should match with grouping columns exactly, or empty (means all</span>
<span class="sd"> the grouping columns).</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.cube(&quot;name&quot;).agg(grouping_id(), sum(&quot;age&quot;)).orderBy(&quot;name&quot;).show()</span>
<span class="sd"> +-----+-------------+--------+</span>
<span class="sd"> | name|grouping_id()|sum(age)|</span>
<span class="sd"> +-----+-------------+--------+</span>
<span class="sd"> | null| 1| 7|</span>
<span class="sd"> |Alice| 0| 2|</span>
<span class="sd"> | Bob| 0| 5|</span>
<span class="sd"> +-----+-------------+--------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">grouping_id</span><span class="p">(</span><span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="input_file_name"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.input_file_name.html#pyspark.sql.functions.input_file_name">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.6</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">input_file_name</span><span class="p">():</span>
<span class="sd">&quot;&quot;&quot;Creates a string column for the file name of the current Spark task.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">input_file_name</span><span class="p">())</span></div>
<div class="viewcode-block" id="isnan"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.isnan.html#pyspark.sql.functions.isnan">[docs]</a><span class="k">def</span> <span class="nf">isnan</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;An expression that returns true iff the column is NaN.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1.0, float(&#39;nan&#39;)), (float(&#39;nan&#39;), 2.0)], (&quot;a&quot;, &quot;b&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(isnan(&quot;a&quot;).alias(&quot;r1&quot;), isnan(df.a).alias(&quot;r2&quot;)).collect()</span>
<span class="sd"> [Row(r1=False, r2=False), Row(r1=True, r2=True)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">isnan</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="isnull"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.isnull.html#pyspark.sql.functions.isnull">[docs]</a><span class="k">def</span> <span class="nf">isnull</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;An expression that returns true iff the column is null.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, None), (None, 2)], (&quot;a&quot;, &quot;b&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(isnull(&quot;a&quot;).alias(&quot;r1&quot;), isnull(df.a).alias(&quot;r2&quot;)).collect()</span>
<span class="sd"> [Row(r1=False, r2=False), Row(r1=True, r2=True)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">isnull</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="last"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.last.html#pyspark.sql.functions.last">[docs]</a><span class="k">def</span> <span class="nf">last</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">ignorenulls</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Aggregate function: returns the last value in a group.</span>
<span class="sd"> The function by default returns the last values it sees. It will return the last non-null</span>
<span class="sd"> value it sees when ignoreNulls is set to true. If all values are null, then null is returned.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The function is non-deterministic because its results depends on the order of the</span>
<span class="sd"> rows which may be non-deterministic after a shuffle.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">last</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">ignorenulls</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="monotonically_increasing_id"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.monotonically_increasing_id.html#pyspark.sql.functions.monotonically_increasing_id">[docs]</a><span class="k">def</span> <span class="nf">monotonically_increasing_id</span><span class="p">():</span>
<span class="sd">&quot;&quot;&quot;A column that generates monotonically increasing 64-bit integers.</span>
<span class="sd"> The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive.</span>
<span class="sd"> The current implementation puts the partition ID in the upper 31 bits, and the record number</span>
<span class="sd"> within each partition in the lower 33 bits. The assumption is that the data frame has</span>
<span class="sd"> less than 1 billion partitions, and each partition has less than 8 billion records.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The function is non-deterministic because its result depends on partition IDs.</span>
<span class="sd"> As an example, consider a :class:`DataFrame` with two partitions, each with 3 records.</span>
<span class="sd"> This expression would return the following IDs:</span>
<span class="sd"> 0, 1, 2, 8589934592 (1L &lt;&lt; 33), 8589934593, 8589934594.</span>
<span class="sd"> &gt;&gt;&gt; df0 = sc.parallelize(range(2), 2).mapPartitions(lambda x: [(1,), (2,), (3,)]).toDF([&#39;col1&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df0.select(monotonically_increasing_id().alias(&#39;id&#39;)).collect()</span>
<span class="sd"> [Row(id=0), Row(id=1), Row(id=2), Row(id=8589934592), Row(id=8589934593), Row(id=8589934594)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">monotonically_increasing_id</span><span class="p">())</span></div>
<div class="viewcode-block" id="nanvl"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.nanvl.html#pyspark.sql.functions.nanvl">[docs]</a><span class="k">def</span> <span class="nf">nanvl</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Returns col1 if it is not NaN, or col2 if col1 is NaN.</span>
<span class="sd"> Both inputs should be floating point columns (:class:`DoubleType` or :class:`FloatType`).</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1.0, float(&#39;nan&#39;)), (float(&#39;nan&#39;), 2.0)], (&quot;a&quot;, &quot;b&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(nanvl(&quot;a&quot;, &quot;b&quot;).alias(&quot;r1&quot;), nanvl(df.a, df.b).alias(&quot;r2&quot;)).collect()</span>
<span class="sd"> [Row(r1=1.0, r2=1.0), Row(r1=2.0, r2=2.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">nanvl</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col1</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col2</span><span class="p">)))</span></div>
<div class="viewcode-block" id="percentile_approx"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.percentile_approx.html#pyspark.sql.functions.percentile_approx">[docs]</a><span class="k">def</span> <span class="nf">percentile_approx</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">percentage</span><span class="p">,</span> <span class="n">accuracy</span><span class="o">=</span><span class="mi">10000</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Returns the approximate `percentile` of the numeric column `col` which is the smallest value</span>
<span class="sd"> in the ordered `col` values (sorted from least to greatest) such that no more than `percentage`</span>
<span class="sd"> of `col` values is less than the value or equal to that value.</span>
<span class="sd"> The value of percentage must be between 0.0 and 1.0.</span>
<span class="sd"> The accuracy parameter (default: 10000)</span>
<span class="sd"> is a positive numeric literal which controls approximation accuracy at the cost of memory.</span>
<span class="sd"> Higher value of accuracy yields better accuracy, 1.0/accuracy is the relative error</span>
<span class="sd"> of the approximation.</span>
<span class="sd"> When percentage is an array, each value of the percentage array must be between 0.0 and 1.0.</span>
<span class="sd"> In this case, returns the approximate percentile array of column col</span>
<span class="sd"> at the given percentage array.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; key = (col(&quot;id&quot;) % 3).alias(&quot;key&quot;)</span>
<span class="sd"> &gt;&gt;&gt; value = (randn(42) + key * 10).alias(&quot;value&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(0, 1000, 1, 1).select(key, value)</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... percentile_approx(&quot;value&quot;, [0.25, 0.5, 0.75], 1000000).alias(&quot;quantiles&quot;)</span>
<span class="sd"> ... ).printSchema()</span>
<span class="sd"> root</span>
<span class="sd"> |-- quantiles: array (nullable = true)</span>
<span class="sd"> | |-- element: double (containsNull = false)</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy(&quot;key&quot;).agg(</span>
<span class="sd"> ... percentile_approx(&quot;value&quot;, 0.5, lit(1000000)).alias(&quot;median&quot;)</span>
<span class="sd"> ... ).printSchema()</span>
<span class="sd"> root</span>
<span class="sd"> |-- key: long (nullable = true)</span>
<span class="sd"> |-- median: double (nullable = true)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">percentage</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span>
<span class="c1"># A local list</span>
<span class="n">percentage</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="p">[</span>
<span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">percentage</span>
<span class="p">]))</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">percentage</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="c1"># Already a Column</span>
<span class="n">percentage</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">percentage</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># Probably scalar</span>
<span class="n">percentage</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">percentage</span><span class="p">)</span>
<span class="n">accuracy</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">_to_java_column</span><span class="p">(</span><span class="n">accuracy</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">accuracy</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span>
<span class="k">else</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">accuracy</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">percentile_approx</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">percentage</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">))</span></div>
<div class="viewcode-block" id="rand"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.rand.html#pyspark.sql.functions.rand">[docs]</a><span class="k">def</span> <span class="nf">rand</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Generates a random column with independent and identically distributed (i.i.d.) samples</span>
<span class="sd"> uniformly distributed in [0.0, 1.0).</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The function is non-deterministic in general case.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.withColumn(&#39;rand&#39;, rand(seed=42) * 3).collect()</span>
<span class="sd"> [Row(age=2, name=&#39;Alice&#39;, rand=2.4052597283576684),</span>
<span class="sd"> Row(age=5, name=&#39;Bob&#39;, rand=2.3913904055683974)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="n">seed</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="n">seed</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">rand</span><span class="p">()</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="randn"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.randn.html#pyspark.sql.functions.randn">[docs]</a><span class="k">def</span> <span class="nf">randn</span><span class="p">(</span><span class="n">seed</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Generates a column with independent and identically distributed (i.i.d.) samples from</span>
<span class="sd"> the standard normal distribution.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The function is non-deterministic in general case.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.withColumn(&#39;randn&#39;, randn(seed=42)).collect()</span>
<span class="sd"> [Row(age=2, name=&#39;Alice&#39;, randn=1.1027054481455365),</span>
<span class="sd"> Row(age=5, name=&#39;Bob&#39;, randn=0.7400395449950132)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="n">seed</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="n">seed</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">randn</span><span class="p">()</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="round"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.round.html#pyspark.sql.functions.round">[docs]</a><span class="k">def</span> <span class="nf">round</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">scale</span><span class="o">=</span><span class="mi">0</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Round the given value to `scale` decimal places using HALF_UP rounding mode if `scale` &gt;= 0</span>
<span class="sd"> or at integral part when `scale` &lt; 0.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(2.5,)], [&#39;a&#39;]).select(round(&#39;a&#39;, 0).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=3.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">round</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">scale</span><span class="p">))</span></div>
<div class="viewcode-block" id="bround"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.bround.html#pyspark.sql.functions.bround">[docs]</a><span class="k">def</span> <span class="nf">bround</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">scale</span><span class="o">=</span><span class="mi">0</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Round the given value to `scale` decimal places using HALF_EVEN rounding mode if `scale` &gt;= 0</span>
<span class="sd"> or at integral part when `scale` &lt; 0.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(2.5,)], [&#39;a&#39;]).select(bround(&#39;a&#39;, 0).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=2.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">bround</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">scale</span><span class="p">))</span></div>
<span class="k">def</span> <span class="nf">shiftLeft</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">numBits</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Shift the given value numBits left.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. deprecated:: 3.2.0</span>
<span class="sd"> Use :func:`shiftleft` instead.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">&quot;Deprecated in 3.2, use shiftleft instead.&quot;</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span>
<span class="k">return</span> <span class="n">shiftleft</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">numBits</span><span class="p">)</span>
<div class="viewcode-block" id="shiftleft"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.shiftleft.html#pyspark.sql.functions.shiftleft">[docs]</a><span class="k">def</span> <span class="nf">shiftleft</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">numBits</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Shift the given value numBits left.</span>
<span class="sd"> .. versionadded:: 3.2.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(21,)], [&#39;a&#39;]).select(shiftleft(&#39;a&#39;, 1).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=42)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">shiftleft</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">numBits</span><span class="p">))</span></div>
<span class="k">def</span> <span class="nf">shiftRight</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">numBits</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;(Signed) shift the given value numBits right.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. deprecated:: 3.2.0</span>
<span class="sd"> Use :func:`shiftright` instead.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">&quot;Deprecated in 3.2, use shiftright instead.&quot;</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span>
<span class="k">return</span> <span class="n">shiftright</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">numBits</span><span class="p">)</span>
<div class="viewcode-block" id="shiftright"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.shiftright.html#pyspark.sql.functions.shiftright">[docs]</a><span class="k">def</span> <span class="nf">shiftright</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">numBits</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;(Signed) shift the given value numBits right.</span>
<span class="sd"> .. versionadded:: 3.2.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(42,)], [&#39;a&#39;]).select(shiftright(&#39;a&#39;, 1).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=21)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">shiftRight</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">numBits</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">shiftRightUnsigned</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">numBits</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Unsigned shift the given value numBits right.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. deprecated:: 3.2.0</span>
<span class="sd"> Use :func:`shiftrightunsigned` instead.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">&quot;Deprecated in 3.2, use shiftrightunsigned instead.&quot;</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span>
<span class="k">return</span> <span class="n">shiftrightunsigned</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">numBits</span><span class="p">)</span>
<div class="viewcode-block" id="shiftrightunsigned"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.shiftrightunsigned.html#pyspark.sql.functions.shiftrightunsigned">[docs]</a><span class="k">def</span> <span class="nf">shiftrightunsigned</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">numBits</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Unsigned shift the given value numBits right.</span>
<span class="sd"> .. versionadded:: 3.2.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(-42,)], [&#39;a&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(shiftrightunsigned(&#39;a&#39;, 1).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=9223372036854775787)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">shiftRightUnsigned</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">numBits</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="spark_partition_id"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.spark_partition_id.html#pyspark.sql.functions.spark_partition_id">[docs]</a><span class="k">def</span> <span class="nf">spark_partition_id</span><span class="p">():</span>
<span class="sd">&quot;&quot;&quot;A column for partition ID.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This is non deterministic because it depends on data partitioning and task scheduling.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.repartition(1).select(spark_partition_id().alias(&quot;pid&quot;)).collect()</span>
<span class="sd"> [Row(pid=0), Row(pid=0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">spark_partition_id</span><span class="p">())</span></div>
<div class="viewcode-block" id="expr"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.expr.html#pyspark.sql.functions.expr">[docs]</a><span class="k">def</span> <span class="nf">expr</span><span class="p">(</span><span class="nb">str</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Parses the expression string into the column that it represents</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.select(expr(&quot;length(name)&quot;)).collect()</span>
<span class="sd"> [Row(length(name)=5), Row(length(name)=3)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">expr</span><span class="p">(</span><span class="nb">str</span><span class="p">))</span></div>
<div class="viewcode-block" id="struct"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.struct.html#pyspark.sql.functions.struct">[docs]</a><span class="k">def</span> <span class="nf">struct</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Creates a new struct column.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : list, set, str or :class:`~pyspark.sql.Column`</span>
<span class="sd"> column names or :class:`~pyspark.sql.Column`\\s to contain in the output struct.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.select(struct(&#39;age&#39;, &#39;name&#39;).alias(&quot;struct&quot;)).collect()</span>
<span class="sd"> [Row(struct=Row(age=2, name=&#39;Alice&#39;)), Row(struct=Row(age=5, name=&#39;Bob&#39;))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(struct([df.age, df.name]).alias(&quot;struct&quot;)).collect()</span>
<span class="sd"> [Row(struct=Row(age=2, name=&#39;Alice&#39;)), Row(struct=Row(age=5, name=&#39;Bob&#39;))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">set</span><span class="p">)):</span>
<span class="n">cols</span> <span class="o">=</span> <span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">struct</span><span class="p">(</span><span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="greatest"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.greatest.html#pyspark.sql.functions.greatest">[docs]</a><span class="k">def</span> <span class="nf">greatest</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the greatest value of the list of column names, skipping null values.</span>
<span class="sd"> This function takes at least 2 parameters. It will return null iff all parameters are null.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, 4, 3)], [&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(greatest(df.a, df.b, df.c).alias(&quot;greatest&quot;)).collect()</span>
<span class="sd"> [Row(greatest=4)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mi">2</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;greatest should take at least two columns&quot;</span><span class="p">)</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">greatest</span><span class="p">(</span><span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">)))</span></div>
<div class="viewcode-block" id="least"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.least.html#pyspark.sql.functions.least">[docs]</a><span class="k">def</span> <span class="nf">least</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the least value of the list of column names, skipping null values.</span>
<span class="sd"> This function takes at least 2 parameters. It will return null iff all parameters are null.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, 4, 3)], [&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(least(df.a, df.b, df.c).alias(&quot;least&quot;)).collect()</span>
<span class="sd"> [Row(least=1)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mi">2</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;least should take at least two columns&quot;</span><span class="p">)</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">least</span><span class="p">(</span><span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">)))</span></div>
<div class="viewcode-block" id="when"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.when.html#pyspark.sql.functions.when">[docs]</a><span class="k">def</span> <span class="nf">when</span><span class="p">(</span><span class="n">condition</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Evaluates a list of conditions and returns one of multiple possible result expressions.</span>
<span class="sd"> If :func:`pyspark.sql.Column.otherwise` is not invoked, None is returned for unmatched</span>
<span class="sd"> conditions.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> condition : :class:`~pyspark.sql.Column`</span>
<span class="sd"> a boolean :class:`~pyspark.sql.Column` expression.</span>
<span class="sd"> value :</span>
<span class="sd"> a literal value, or a :class:`~pyspark.sql.Column` expression.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.select(when(df[&#39;age&#39;] == 2, 3).otherwise(4).alias(&quot;age&quot;)).collect()</span>
<span class="sd"> [Row(age=3), Row(age=4)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(when(df.age == 2, df.age + 1).alias(&quot;age&quot;)).collect()</span>
<span class="sd"> [Row(age=3), Row(age=None)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">condition</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;condition should be a Column&quot;</span><span class="p">)</span>
<span class="n">v</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">_jc</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> <span class="k">else</span> <span class="n">value</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">condition</span><span class="o">.</span><span class="n">_jc</span><span class="p">,</span> <span class="n">v</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="log"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.log.html#pyspark.sql.functions.log">[docs]</a><span class="k">def</span> <span class="nf">log</span><span class="p">(</span><span class="n">arg1</span><span class="p">,</span> <span class="n">arg2</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Returns the first argument-based logarithm of the second argument.</span>
<span class="sd"> If there is only one argument, then this takes the natural logarithm of the argument.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.select(log(10.0, df.age).alias(&#39;ten&#39;)).rdd.map(lambda l: str(l.ten)[:7]).collect()</span>
<span class="sd"> [&#39;0.30102&#39;, &#39;0.69897&#39;]</span>
<span class="sd"> &gt;&gt;&gt; df.select(log(df.age).alias(&#39;e&#39;)).rdd.map(lambda l: str(l.e)[:7]).collect()</span>
<span class="sd"> [&#39;0.69314&#39;, &#39;1.60943&#39;]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="n">arg2</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">log</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">arg1</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">log</span><span class="p">(</span><span class="n">arg1</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">arg2</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="log2"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.log2.html#pyspark.sql.functions.log2">[docs]</a><span class="k">def</span> <span class="nf">log2</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Returns the base-2 logarithm of the argument.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(4,)], [&#39;a&#39;]).select(log2(&#39;a&#39;).alias(&#39;log2&#39;)).collect()</span>
<span class="sd"> [Row(log2=2.0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">log2</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="conv"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.conv.html#pyspark.sql.functions.conv">[docs]</a><span class="k">def</span> <span class="nf">conv</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">fromBase</span><span class="p">,</span> <span class="n">toBase</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Convert a number in a string column from one base to another.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;010101&quot;,)], [&#39;n&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(conv(df.n, 2, 16).alias(&#39;hex&#39;)).collect()</span>
<span class="sd"> [Row(hex=&#39;15&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">conv</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">fromBase</span><span class="p">,</span> <span class="n">toBase</span><span class="p">))</span></div>
<div class="viewcode-block" id="factorial"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.factorial.html#pyspark.sql.functions.factorial">[docs]</a><span class="k">def</span> <span class="nf">factorial</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the factorial of the given value.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(5,)], [&#39;n&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(factorial(df.n).alias(&#39;f&#39;)).collect()</span>
<span class="sd"> [Row(f=120)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">factorial</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<span class="c1"># --------------- Window functions ------------------------</span>
<div class="viewcode-block" id="lag"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.lag.html#pyspark.sql.functions.lag">[docs]</a><span class="k">def</span> <span class="nf">lag</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">offset</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Window function: returns the value that is `offset` rows before the current row, and</span>
<span class="sd"> `default` if there is less than `offset` rows before the current row. For example,</span>
<span class="sd"> an `offset` of one will return the previous row at any given point in the window partition.</span>
<span class="sd"> This is equivalent to the LAG function in SQL.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> offset : int, optional</span>
<span class="sd"> number of row to extend</span>
<span class="sd"> default : optional</span>
<span class="sd"> default value</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">lag</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">offset</span><span class="p">,</span> <span class="n">default</span><span class="p">))</span></div>
<div class="viewcode-block" id="lead"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.lead.html#pyspark.sql.functions.lead">[docs]</a><span class="k">def</span> <span class="nf">lead</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">offset</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Window function: returns the value that is `offset` rows after the current row, and</span>
<span class="sd"> `default` if there is less than `offset` rows after the current row. For example,</span>
<span class="sd"> an `offset` of one will return the next row at any given point in the window partition.</span>
<span class="sd"> This is equivalent to the LEAD function in SQL.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> offset : int, optional</span>
<span class="sd"> number of row to extend</span>
<span class="sd"> default : optional</span>
<span class="sd"> default value</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">lead</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">offset</span><span class="p">,</span> <span class="n">default</span><span class="p">))</span></div>
<div class="viewcode-block" id="nth_value"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.nth_value.html#pyspark.sql.functions.nth_value">[docs]</a><span class="k">def</span> <span class="nf">nth_value</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">offset</span><span class="p">,</span> <span class="n">ignoreNulls</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Window function: returns the value that is the `offset`\\th row of the window frame</span>
<span class="sd"> (counting from 1), and `null` if the size of window frame is less than `offset` rows.</span>
<span class="sd"> It will return the `offset`\\th non-null value it sees when `ignoreNulls` is set to</span>
<span class="sd"> true. If all values are null, then null is returned.</span>
<span class="sd"> This is equivalent to the nth_value function in SQL.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> offset : int, optional</span>
<span class="sd"> number of row to use as the value</span>
<span class="sd"> ignoreNulls : bool, optional</span>
<span class="sd"> indicates the Nth value should skip null in the</span>
<span class="sd"> determination of which row to use</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">nth_value</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">offset</span><span class="p">,</span> <span class="n">ignoreNulls</span><span class="p">))</span></div>
<div class="viewcode-block" id="ntile"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.ntile.html#pyspark.sql.functions.ntile">[docs]</a><span class="k">def</span> <span class="nf">ntile</span><span class="p">(</span><span class="n">n</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Window function: returns the ntile group id (from 1 to `n` inclusive)</span>
<span class="sd"> in an ordered window partition. For example, if `n` is 4, the first</span>
<span class="sd"> quarter of the rows will get value 1, the second quarter will get 2,</span>
<span class="sd"> the third quarter will get 3, and the last quarter will get 4.</span>
<span class="sd"> This is equivalent to the NTILE function in SQL.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> n : int</span>
<span class="sd"> an integer</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">ntile</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">n</span><span class="p">)))</span></div>
<span class="c1"># ---------------------- Date/Timestamp functions ------------------------------</span>
<div class="viewcode-block" id="current_date"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.current_date.html#pyspark.sql.functions.current_date">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.5</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">current_date</span><span class="p">():</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the current date at the start of query evaluation as a :class:`DateType` column.</span>
<span class="sd"> All calls of current_date within the same query return the same value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">current_date</span><span class="p">())</span></div>
<div class="viewcode-block" id="current_timestamp"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.current_timestamp.html#pyspark.sql.functions.current_timestamp">[docs]</a><span class="k">def</span> <span class="nf">current_timestamp</span><span class="p">():</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the current timestamp at the start of query evaluation as a :class:`TimestampType`</span>
<span class="sd"> column. All calls of current_timestamp within the same query return the same value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">current_timestamp</span><span class="p">())</span></div>
<div class="viewcode-block" id="date_format"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.date_format.html#pyspark.sql.functions.date_format">[docs]</a><span class="k">def</span> <span class="nf">date_format</span><span class="p">(</span><span class="n">date</span><span class="p">,</span> <span class="nb">format</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts a date/timestamp/string to a value of string in the format specified by the date</span>
<span class="sd"> format given by the second argument.</span>
<span class="sd"> A pattern could be for instance `dd.MM.yyyy` and could return a string like &#39;18.03.1993&#39;. All</span>
<span class="sd"> pattern letters of `datetime pattern`_. can be used.</span>
<span class="sd"> .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Whenever possible, use specialized functions like `year`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(date_format(&#39;dt&#39;, &#39;MM/dd/yyy&#39;).alias(&#39;date&#39;)).collect()</span>
<span class="sd"> [Row(date=&#39;04/08/2015&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">date_format</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">date</span><span class="p">),</span> <span class="nb">format</span><span class="p">))</span></div>
<div class="viewcode-block" id="year"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.year.html#pyspark.sql.functions.year">[docs]</a><span class="k">def</span> <span class="nf">year</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the year of a given date as integer.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(year(&#39;dt&#39;).alias(&#39;year&#39;)).collect()</span>
<span class="sd"> [Row(year=2015)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">year</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="quarter"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.quarter.html#pyspark.sql.functions.quarter">[docs]</a><span class="k">def</span> <span class="nf">quarter</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the quarter of a given date as integer.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(quarter(&#39;dt&#39;).alias(&#39;quarter&#39;)).collect()</span>
<span class="sd"> [Row(quarter=2)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">quarter</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="month"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.month.html#pyspark.sql.functions.month">[docs]</a><span class="k">def</span> <span class="nf">month</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the month of a given date as integer.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(month(&#39;dt&#39;).alias(&#39;month&#39;)).collect()</span>
<span class="sd"> [Row(month=4)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">month</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="dayofweek"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.dayofweek.html#pyspark.sql.functions.dayofweek">[docs]</a><span class="k">def</span> <span class="nf">dayofweek</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the day of the week of a given date as integer.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(dayofweek(&#39;dt&#39;).alias(&#39;day&#39;)).collect()</span>
<span class="sd"> [Row(day=4)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">dayofweek</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="dayofmonth"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.dayofmonth.html#pyspark.sql.functions.dayofmonth">[docs]</a><span class="k">def</span> <span class="nf">dayofmonth</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the day of the month of a given date as integer.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(dayofmonth(&#39;dt&#39;).alias(&#39;day&#39;)).collect()</span>
<span class="sd"> [Row(day=8)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">dayofmonth</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="dayofyear"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.dayofyear.html#pyspark.sql.functions.dayofyear">[docs]</a><span class="k">def</span> <span class="nf">dayofyear</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the day of the year of a given date as integer.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(dayofyear(&#39;dt&#39;).alias(&#39;day&#39;)).collect()</span>
<span class="sd"> [Row(day=98)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">dayofyear</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="hour"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.hour.html#pyspark.sql.functions.hour">[docs]</a><span class="k">def</span> <span class="nf">hour</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the hours of a given date as integer.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08 13:08:15&#39;,)], [&#39;ts&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(hour(&#39;ts&#39;).alias(&#39;hour&#39;)).collect()</span>
<span class="sd"> [Row(hour=13)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">hour</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="minute"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.minute.html#pyspark.sql.functions.minute">[docs]</a><span class="k">def</span> <span class="nf">minute</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the minutes of a given date as integer.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08 13:08:15&#39;,)], [&#39;ts&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(minute(&#39;ts&#39;).alias(&#39;minute&#39;)).collect()</span>
<span class="sd"> [Row(minute=8)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">minute</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="second"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.second.html#pyspark.sql.functions.second">[docs]</a><span class="k">def</span> <span class="nf">second</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the seconds of a given date as integer.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08 13:08:15&#39;,)], [&#39;ts&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(second(&#39;ts&#39;).alias(&#39;second&#39;)).collect()</span>
<span class="sd"> [Row(second=15)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">second</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="weekofyear"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.weekofyear.html#pyspark.sql.functions.weekofyear">[docs]</a><span class="k">def</span> <span class="nf">weekofyear</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extract the week number of a given date as integer.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(weekofyear(df.dt).alias(&#39;week&#39;)).collect()</span>
<span class="sd"> [Row(week=15)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">weekofyear</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="date_add"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.date_add.html#pyspark.sql.functions.date_add">[docs]</a><span class="k">def</span> <span class="nf">date_add</span><span class="p">(</span><span class="n">start</span><span class="p">,</span> <span class="n">days</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the date that is `days` days after `start`</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(date_add(df.dt, 1).alias(&#39;next_date&#39;)).collect()</span>
<span class="sd"> [Row(next_date=datetime.date(2015, 4, 9))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">date_add</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">start</span><span class="p">),</span> <span class="n">days</span><span class="p">))</span></div>
<div class="viewcode-block" id="date_sub"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.date_sub.html#pyspark.sql.functions.date_sub">[docs]</a><span class="k">def</span> <span class="nf">date_sub</span><span class="p">(</span><span class="n">start</span><span class="p">,</span> <span class="n">days</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the date that is `days` days before `start`</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(date_sub(df.dt, 1).alias(&#39;prev_date&#39;)).collect()</span>
<span class="sd"> [Row(prev_date=datetime.date(2015, 4, 7))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">date_sub</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">start</span><span class="p">),</span> <span class="n">days</span><span class="p">))</span></div>
<div class="viewcode-block" id="datediff"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.datediff.html#pyspark.sql.functions.datediff">[docs]</a><span class="k">def</span> <span class="nf">datediff</span><span class="p">(</span><span class="n">end</span><span class="p">,</span> <span class="n">start</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the number of days from `start` to `end`.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,&#39;2015-05-10&#39;)], [&#39;d1&#39;, &#39;d2&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(datediff(df.d2, df.d1).alias(&#39;diff&#39;)).collect()</span>
<span class="sd"> [Row(diff=32)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">datediff</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">end</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">start</span><span class="p">)))</span></div>
<div class="viewcode-block" id="add_months"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.add_months.html#pyspark.sql.functions.add_months">[docs]</a><span class="k">def</span> <span class="nf">add_months</span><span class="p">(</span><span class="n">start</span><span class="p">,</span> <span class="n">months</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the date that is `months` months after `start`</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(add_months(df.dt, 1).alias(&#39;next_month&#39;)).collect()</span>
<span class="sd"> [Row(next_month=datetime.date(2015, 5, 8))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">add_months</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">start</span><span class="p">),</span> <span class="n">months</span><span class="p">))</span></div>
<div class="viewcode-block" id="months_between"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.months_between.html#pyspark.sql.functions.months_between">[docs]</a><span class="k">def</span> <span class="nf">months_between</span><span class="p">(</span><span class="n">date1</span><span class="p">,</span> <span class="n">date2</span><span class="p">,</span> <span class="n">roundOff</span><span class="o">=</span><span class="kc">True</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns number of months between dates date1 and date2.</span>
<span class="sd"> If date1 is later than date2, then the result is positive.</span>
<span class="sd"> If date1 and date2 are on the same day of month, or both are the last day of month,</span>
<span class="sd"> returns an integer (time of day will be ignored).</span>
<span class="sd"> The result is rounded off to 8 digits unless `roundOff` is set to `False`.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-28 10:30:00&#39;, &#39;1996-10-30&#39;)], [&#39;date1&#39;, &#39;date2&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(months_between(df.date1, df.date2).alias(&#39;months&#39;)).collect()</span>
<span class="sd"> [Row(months=3.94959677)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(months_between(df.date1, df.date2, False).alias(&#39;months&#39;)).collect()</span>
<span class="sd"> [Row(months=3.9495967741935485)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">months_between</span><span class="p">(</span>
<span class="n">_to_java_column</span><span class="p">(</span><span class="n">date1</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">date2</span><span class="p">),</span> <span class="n">roundOff</span><span class="p">))</span></div>
<div class="viewcode-block" id="to_date"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.to_date.html#pyspark.sql.functions.to_date">[docs]</a><span class="k">def</span> <span class="nf">to_date</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Converts a :class:`~pyspark.sql.Column` into :class:`pyspark.sql.types.DateType`</span>
<span class="sd"> using the optionally specified format. Specify formats according to `datetime pattern`_.</span>
<span class="sd"> By default, it follows casting rules to :class:`pyspark.sql.types.DateType` if the format</span>
<span class="sd"> is omitted. Equivalent to ``col.cast(&quot;date&quot;)``.</span>
<span class="sd"> .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html</span>
<span class="sd"> .. versionadded:: 2.2.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-28 10:30:00&#39;,)], [&#39;t&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_date(df.t).alias(&#39;date&#39;)).collect()</span>
<span class="sd"> [Row(date=datetime.date(1997, 2, 28))]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-28 10:30:00&#39;,)], [&#39;t&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_date(df.t, &#39;yyyy-MM-dd HH:mm:ss&#39;).alias(&#39;date&#39;)).collect()</span>
<span class="sd"> [Row(date=datetime.date(1997, 2, 28))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="nb">format</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">to_date</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">to_date</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="nb">format</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="to_timestamp"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.to_timestamp.html#pyspark.sql.functions.to_timestamp">[docs]</a><span class="k">def</span> <span class="nf">to_timestamp</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Converts a :class:`~pyspark.sql.Column` into :class:`pyspark.sql.types.TimestampType`</span>
<span class="sd"> using the optionally specified format. Specify formats according to `datetime pattern`_.</span>
<span class="sd"> By default, it follows casting rules to :class:`pyspark.sql.types.TimestampType` if the format</span>
<span class="sd"> is omitted. Equivalent to ``col.cast(&quot;timestamp&quot;)``.</span>
<span class="sd"> .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html</span>
<span class="sd"> .. versionadded:: 2.2.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-28 10:30:00&#39;,)], [&#39;t&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_timestamp(df.t).alias(&#39;dt&#39;)).collect()</span>
<span class="sd"> [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-28 10:30:00&#39;,)], [&#39;t&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_timestamp(df.t, &#39;yyyy-MM-dd HH:mm:ss&#39;).alias(&#39;dt&#39;)).collect()</span>
<span class="sd"> [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="nb">format</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">to_timestamp</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">to_timestamp</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="nb">format</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="trunc"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.trunc.html#pyspark.sql.functions.trunc">[docs]</a><span class="k">def</span> <span class="nf">trunc</span><span class="p">(</span><span class="n">date</span><span class="p">,</span> <span class="nb">format</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns date truncated to the unit specified by the format.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> date : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> format : str</span>
<span class="sd"> &#39;year&#39;, &#39;yyyy&#39;, &#39;yy&#39; to truncate by year,</span>
<span class="sd"> or &#39;month&#39;, &#39;mon&#39;, &#39;mm&#39; to truncate by month</span>
<span class="sd"> Other options are: &#39;week&#39;, &#39;quarter&#39;</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-28&#39;,)], [&#39;d&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(trunc(df.d, &#39;year&#39;).alias(&#39;year&#39;)).collect()</span>
<span class="sd"> [Row(year=datetime.date(1997, 1, 1))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(trunc(df.d, &#39;mon&#39;).alias(&#39;month&#39;)).collect()</span>
<span class="sd"> [Row(month=datetime.date(1997, 2, 1))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">trunc</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">date</span><span class="p">),</span> <span class="nb">format</span><span class="p">))</span></div>
<div class="viewcode-block" id="date_trunc"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.date_trunc.html#pyspark.sql.functions.date_trunc">[docs]</a><span class="k">def</span> <span class="nf">date_trunc</span><span class="p">(</span><span class="nb">format</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns timestamp truncated to the unit specified by the format.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> format : str</span>
<span class="sd"> &#39;year&#39;, &#39;yyyy&#39;, &#39;yy&#39; to truncate by year,</span>
<span class="sd"> &#39;month&#39;, &#39;mon&#39;, &#39;mm&#39; to truncate by month,</span>
<span class="sd"> &#39;day&#39;, &#39;dd&#39; to truncate by day,</span>
<span class="sd"> Other options are:</span>
<span class="sd"> &#39;microsecond&#39;, &#39;millisecond&#39;, &#39;second&#39;, &#39;minute&#39;, &#39;hour&#39;, &#39;week&#39;, &#39;quarter&#39;</span>
<span class="sd"> timestamp : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-28 05:02:11&#39;,)], [&#39;t&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(date_trunc(&#39;year&#39;, df.t).alias(&#39;year&#39;)).collect()</span>
<span class="sd"> [Row(year=datetime.datetime(1997, 1, 1, 0, 0))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(date_trunc(&#39;mon&#39;, df.t).alias(&#39;month&#39;)).collect()</span>
<span class="sd"> [Row(month=datetime.datetime(1997, 2, 1, 0, 0))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">date_trunc</span><span class="p">(</span><span class="nb">format</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">timestamp</span><span class="p">)))</span></div>
<div class="viewcode-block" id="next_day"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.next_day.html#pyspark.sql.functions.next_day">[docs]</a><span class="k">def</span> <span class="nf">next_day</span><span class="p">(</span><span class="n">date</span><span class="p">,</span> <span class="n">dayOfWeek</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the first date which is later than the value of the date column.</span>
<span class="sd"> Day of the week parameter is case insensitive, and accepts:</span>
<span class="sd"> &quot;Mon&quot;, &quot;Tue&quot;, &quot;Wed&quot;, &quot;Thu&quot;, &quot;Fri&quot;, &quot;Sat&quot;, &quot;Sun&quot;.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;2015-07-27&#39;,)], [&#39;d&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(next_day(df.d, &#39;Sun&#39;).alias(&#39;date&#39;)).collect()</span>
<span class="sd"> [Row(date=datetime.date(2015, 8, 2))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">next_day</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">date</span><span class="p">),</span> <span class="n">dayOfWeek</span><span class="p">))</span></div>
<div class="viewcode-block" id="last_day"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.last_day.html#pyspark.sql.functions.last_day">[docs]</a><span class="k">def</span> <span class="nf">last_day</span><span class="p">(</span><span class="n">date</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the last day of the month which the given date belongs to.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-10&#39;,)], [&#39;d&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(last_day(df.d).alias(&#39;date&#39;)).collect()</span>
<span class="sd"> [Row(date=datetime.date(1997, 2, 28))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">last_day</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">date</span><span class="p">)))</span></div>
<div class="viewcode-block" id="from_unixtime"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.from_unixtime.html#pyspark.sql.functions.from_unixtime">[docs]</a><span class="k">def</span> <span class="nf">from_unixtime</span><span class="p">(</span><span class="n">timestamp</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s2">&quot;yyyy-MM-dd HH:mm:ss&quot;</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string</span>
<span class="sd"> representing the timestamp of that moment in the current system time zone in the given</span>
<span class="sd"> format.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.set(&quot;spark.sql.session.timeZone&quot;, &quot;America/Los_Angeles&quot;)</span>
<span class="sd"> &gt;&gt;&gt; time_df = spark.createDataFrame([(1428476400,)], [&#39;unix_time&#39;])</span>
<span class="sd"> &gt;&gt;&gt; time_df.select(from_unixtime(&#39;unix_time&#39;).alias(&#39;ts&#39;)).collect()</span>
<span class="sd"> [Row(ts=&#39;2015-04-08 00:00:00&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.unset(&quot;spark.sql.session.timeZone&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">from_unixtime</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">timestamp</span><span class="p">),</span> <span class="nb">format</span><span class="p">))</span></div>
<div class="viewcode-block" id="unix_timestamp"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.unix_timestamp.html#pyspark.sql.functions.unix_timestamp">[docs]</a><span class="k">def</span> <span class="nf">unix_timestamp</span><span class="p">(</span><span class="n">timestamp</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s1">&#39;yyyy-MM-dd HH:mm:ss&#39;</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Convert time string with given pattern (&#39;yyyy-MM-dd HH:mm:ss&#39;, by default)</span>
<span class="sd"> to Unix time stamp (in seconds), using the default timezone and the default</span>
<span class="sd"> locale, return null if fail.</span>
<span class="sd"> if `timestamp` is None, then it returns current timestamp.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.set(&quot;spark.sql.session.timeZone&quot;, &quot;America/Los_Angeles&quot;)</span>
<span class="sd"> &gt;&gt;&gt; time_df = spark.createDataFrame([(&#39;2015-04-08&#39;,)], [&#39;dt&#39;])</span>
<span class="sd"> &gt;&gt;&gt; time_df.select(unix_timestamp(&#39;dt&#39;, &#39;yyyy-MM-dd&#39;).alias(&#39;unix_time&#39;)).collect()</span>
<span class="sd"> [Row(unix_time=1428476400)]</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.unset(&quot;spark.sql.session.timeZone&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="n">timestamp</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">unix_timestamp</span><span class="p">())</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">unix_timestamp</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">timestamp</span><span class="p">),</span> <span class="nb">format</span><span class="p">))</span></div>
<div class="viewcode-block" id="from_utc_timestamp"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.from_utc_timestamp.html#pyspark.sql.functions.from_utc_timestamp">[docs]</a><span class="k">def</span> <span class="nf">from_utc_timestamp</span><span class="p">(</span><span class="n">timestamp</span><span class="p">,</span> <span class="n">tz</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> This is a common function for databases supporting TIMESTAMP WITHOUT TIMEZONE. This function</span>
<span class="sd"> takes a timestamp which is timezone-agnostic, and interprets it as a timestamp in UTC, and</span>
<span class="sd"> renders that timestamp as a timestamp in the given time zone.</span>
<span class="sd"> However, timestamp in Spark represents number of microseconds from the Unix epoch, which is not</span>
<span class="sd"> timezone-agnostic. So in Spark this function just shift the timestamp value from UTC timezone to</span>
<span class="sd"> the given timezone.</span>
<span class="sd"> This function may return confusing result if the input is a string with timezone, e.g.</span>
<span class="sd"> &#39;2018-03-13T06:18:23+00:00&#39;. The reason is that, Spark firstly cast the string to timestamp</span>
<span class="sd"> according to the timezone in the string, and finally display the result by converting the</span>
<span class="sd"> timestamp to string according to the session local timezone.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> timestamp : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the column that contains timestamps</span>
<span class="sd"> tz : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A string detailing the time zone ID that the input should be adjusted to. It should</span>
<span class="sd"> be in the format of either region-based zone IDs or zone offsets. Region IDs must</span>
<span class="sd"> have the form &#39;area/city&#39;, such as &#39;America/Los_Angeles&#39;. Zone offsets must be in</span>
<span class="sd"> the format &#39;(+|-)HH:mm&#39;, for example &#39;-08:00&#39; or &#39;+01:00&#39;. Also &#39;UTC&#39; and &#39;Z&#39; are</span>
<span class="sd"> supported as aliases of &#39;+00:00&#39;. Other short names are not recommended to use</span>
<span class="sd"> because they can be ambiguous.</span>
<span class="sd"> .. versionchanged:: 2.4</span>
<span class="sd"> `tz` can take a :class:`~pyspark.sql.Column` containing timezone ID strings.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-28 10:30:00&#39;, &#39;JST&#39;)], [&#39;ts&#39;, &#39;tz&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_utc_timestamp(df.ts, &quot;PST&quot;).alias(&#39;local_time&#39;)).collect()</span>
<span class="sd"> [Row(local_time=datetime.datetime(1997, 2, 28, 2, 30))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_utc_timestamp(df.ts, df.tz).alias(&#39;local_time&#39;)).collect()</span>
<span class="sd"> [Row(local_time=datetime.datetime(1997, 2, 28, 19, 30))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">tz</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="n">tz</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">tz</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">from_utc_timestamp</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">timestamp</span><span class="p">),</span> <span class="n">tz</span><span class="p">))</span></div>
<div class="viewcode-block" id="to_utc_timestamp"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.to_utc_timestamp.html#pyspark.sql.functions.to_utc_timestamp">[docs]</a><span class="k">def</span> <span class="nf">to_utc_timestamp</span><span class="p">(</span><span class="n">timestamp</span><span class="p">,</span> <span class="n">tz</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> This is a common function for databases supporting TIMESTAMP WITHOUT TIMEZONE. This function</span>
<span class="sd"> takes a timestamp which is timezone-agnostic, and interprets it as a timestamp in the given</span>
<span class="sd"> timezone, and renders that timestamp as a timestamp in UTC.</span>
<span class="sd"> However, timestamp in Spark represents number of microseconds from the Unix epoch, which is not</span>
<span class="sd"> timezone-agnostic. So in Spark this function just shift the timestamp value from the given</span>
<span class="sd"> timezone to UTC timezone.</span>
<span class="sd"> This function may return confusing result if the input is a string with timezone, e.g.</span>
<span class="sd"> &#39;2018-03-13T06:18:23+00:00&#39;. The reason is that, Spark firstly cast the string to timestamp</span>
<span class="sd"> according to the timezone in the string, and finally display the result by converting the</span>
<span class="sd"> timestamp to string according to the session local timezone.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> timestamp : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the column that contains timestamps</span>
<span class="sd"> tz : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A string detailing the time zone ID that the input should be adjusted to. It should</span>
<span class="sd"> be in the format of either region-based zone IDs or zone offsets. Region IDs must</span>
<span class="sd"> have the form &#39;area/city&#39;, such as &#39;America/Los_Angeles&#39;. Zone offsets must be in</span>
<span class="sd"> the format &#39;(+|-)HH:mm&#39;, for example &#39;-08:00&#39; or &#39;+01:00&#39;. Also &#39;UTC&#39; and &#39;Z&#39; are</span>
<span class="sd"> upported as aliases of &#39;+00:00&#39;. Other short names are not recommended to use</span>
<span class="sd"> because they can be ambiguous.</span>
<span class="sd"> .. versionchanged:: 2.4.0</span>
<span class="sd"> `tz` can take a :class:`~pyspark.sql.Column` containing timezone ID strings.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;1997-02-28 10:30:00&#39;, &#39;JST&#39;)], [&#39;ts&#39;, &#39;tz&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_utc_timestamp(df.ts, &quot;PST&quot;).alias(&#39;utc_time&#39;)).collect()</span>
<span class="sd"> [Row(utc_time=datetime.datetime(1997, 2, 28, 18, 30))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_utc_timestamp(df.ts, df.tz).alias(&#39;utc_time&#39;)).collect()</span>
<span class="sd"> [Row(utc_time=datetime.datetime(1997, 2, 28, 1, 30))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">tz</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="n">tz</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">tz</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">to_utc_timestamp</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">timestamp</span><span class="p">),</span> <span class="n">tz</span><span class="p">))</span></div>
<div class="viewcode-block" id="timestamp_seconds"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.timestamp_seconds.html#pyspark.sql.functions.timestamp_seconds">[docs]</a><span class="k">def</span> <span class="nf">timestamp_seconds</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import timestamp_seconds</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.set(&quot;spark.sql.session.timeZone&quot;, &quot;America/Los_Angeles&quot;)</span>
<span class="sd"> &gt;&gt;&gt; time_df = spark.createDataFrame([(1230219000,)], [&#39;unix_time&#39;])</span>
<span class="sd"> &gt;&gt;&gt; time_df.select(timestamp_seconds(time_df.unix_time).alias(&#39;ts&#39;)).show()</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> | ts|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> |2008-12-25 07:30:00|</span>
<span class="sd"> +-------------------+</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.unset(&quot;spark.sql.session.timeZone&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">timestamp_seconds</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="window"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.window.html#pyspark.sql.functions.window">[docs]</a><span class="k">def</span> <span class="nf">window</span><span class="p">(</span><span class="n">timeColumn</span><span class="p">,</span> <span class="n">windowDuration</span><span class="p">,</span> <span class="n">slideDuration</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">startTime</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Bucketize rows into one or more time windows given a timestamp specifying column. Window</span>
<span class="sd"> starts are inclusive but the window ends are exclusive, e.g. 12:05 will be in the window</span>
<span class="sd"> [12:05,12:10) but not in [12:00,12:05). Windows can support microsecond precision. Windows in</span>
<span class="sd"> the order of months are not supported.</span>
<span class="sd"> The time column must be of :class:`pyspark.sql.types.TimestampType`.</span>
<span class="sd"> Durations are provided as strings, e.g. &#39;1 second&#39;, &#39;1 day 12 hours&#39;, &#39;2 minutes&#39;. Valid</span>
<span class="sd"> interval strings are &#39;week&#39;, &#39;day&#39;, &#39;hour&#39;, &#39;minute&#39;, &#39;second&#39;, &#39;millisecond&#39;, &#39;microsecond&#39;.</span>
<span class="sd"> If the ``slideDuration`` is not provided, the windows will be tumbling windows.</span>
<span class="sd"> The startTime is the offset with respect to 1970-01-01 00:00:00 UTC with which to start</span>
<span class="sd"> window intervals. For example, in order to have hourly tumbling windows that start 15 minutes</span>
<span class="sd"> past the hour, e.g. 12:15-13:15, 13:15-14:15... provide `startTime` as `15 minutes`.</span>
<span class="sd"> The output column will be a struct called &#39;window&#39; by default with the nested columns &#39;start&#39;</span>
<span class="sd"> and &#39;end&#39;, where &#39;start&#39; and &#39;end&#39; will be of :class:`pyspark.sql.types.TimestampType`.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> timeColumn : :class:`~pyspark.sql.Column`</span>
<span class="sd"> The column or the expression to use as the timestamp for windowing by time.</span>
<span class="sd"> The time column must be of TimestampType or TimestampNTZType.</span>
<span class="sd"> windowDuration : str</span>
<span class="sd"> A string specifying the width of the window, e.g. `10 minutes`,</span>
<span class="sd"> `1 second`. Check `org.apache.spark.unsafe.types.CalendarInterval` for</span>
<span class="sd"> valid duration identifiers. Note that the duration is a fixed length of</span>
<span class="sd"> time, and does not vary over time according to a calendar. For example,</span>
<span class="sd"> `1 day` always means 86,400,000 milliseconds, not a calendar day.</span>
<span class="sd"> slideDuration : str, optional</span>
<span class="sd"> A new window will be generated every `slideDuration`. Must be less than</span>
<span class="sd"> or equal to the `windowDuration`. Check</span>
<span class="sd"> `org.apache.spark.unsafe.types.CalendarInterval` for valid duration</span>
<span class="sd"> identifiers. This duration is likewise absolute, and does not vary</span>
<span class="sd"> according to a calendar.</span>
<span class="sd"> startTime : str, optional</span>
<span class="sd"> The offset with respect to 1970-01-01 00:00:00 UTC with which to start</span>
<span class="sd"> window intervals. For example, in order to have hourly tumbling windows that</span>
<span class="sd"> start 15 minutes past the hour, e.g. 12:15-13:15, 13:15-14:15... provide</span>
<span class="sd"> `startTime` as `15 minutes`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;2016-03-11 09:00:07&quot;, 1)]).toDF(&quot;date&quot;, &quot;val&quot;)</span>
<span class="sd"> &gt;&gt;&gt; w = df.groupBy(window(&quot;date&quot;, &quot;5 seconds&quot;)).agg(sum(&quot;val&quot;).alias(&quot;sum&quot;))</span>
<span class="sd"> &gt;&gt;&gt; w.select(w.window.start.cast(&quot;string&quot;).alias(&quot;start&quot;),</span>
<span class="sd"> ... w.window.end.cast(&quot;string&quot;).alias(&quot;end&quot;), &quot;sum&quot;).collect()</span>
<span class="sd"> [Row(start=&#39;2016-03-11 09:00:05&#39;, end=&#39;2016-03-11 09:00:10&#39;, sum=1)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">check_string_field</span><span class="p">(</span><span class="n">field</span><span class="p">,</span> <span class="n">fieldName</span><span class="p">):</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">field</span> <span class="ow">or</span> <span class="nb">type</span><span class="p">(</span><span class="n">field</span><span class="p">)</span> <span class="ow">is</span> <span class="ow">not</span> <span class="nb">str</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;</span><span class="si">%s</span><span class="s2"> should be provided as a string&quot;</span> <span class="o">%</span> <span class="n">fieldName</span><span class="p">)</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">time_col</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">timeColumn</span><span class="p">)</span>
<span class="n">check_string_field</span><span class="p">(</span><span class="n">windowDuration</span><span class="p">,</span> <span class="s2">&quot;windowDuration&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">slideDuration</span> <span class="ow">and</span> <span class="n">startTime</span><span class="p">:</span>
<span class="n">check_string_field</span><span class="p">(</span><span class="n">slideDuration</span><span class="p">,</span> <span class="s2">&quot;slideDuration&quot;</span><span class="p">)</span>
<span class="n">check_string_field</span><span class="p">(</span><span class="n">startTime</span><span class="p">,</span> <span class="s2">&quot;startTime&quot;</span><span class="p">)</span>
<span class="n">res</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">window</span><span class="p">(</span><span class="n">time_col</span><span class="p">,</span> <span class="n">windowDuration</span><span class="p">,</span> <span class="n">slideDuration</span><span class="p">,</span> <span class="n">startTime</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">slideDuration</span><span class="p">:</span>
<span class="n">check_string_field</span><span class="p">(</span><span class="n">slideDuration</span><span class="p">,</span> <span class="s2">&quot;slideDuration&quot;</span><span class="p">)</span>
<span class="n">res</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">window</span><span class="p">(</span><span class="n">time_col</span><span class="p">,</span> <span class="n">windowDuration</span><span class="p">,</span> <span class="n">slideDuration</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">startTime</span><span class="p">:</span>
<span class="n">check_string_field</span><span class="p">(</span><span class="n">startTime</span><span class="p">,</span> <span class="s2">&quot;startTime&quot;</span><span class="p">)</span>
<span class="n">res</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">window</span><span class="p">(</span><span class="n">time_col</span><span class="p">,</span> <span class="n">windowDuration</span><span class="p">,</span> <span class="n">windowDuration</span><span class="p">,</span> <span class="n">startTime</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">res</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">window</span><span class="p">(</span><span class="n">time_col</span><span class="p">,</span> <span class="n">windowDuration</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">res</span><span class="p">)</span></div>
<div class="viewcode-block" id="session_window"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.session_window.html#pyspark.sql.functions.session_window">[docs]</a><span class="k">def</span> <span class="nf">session_window</span><span class="p">(</span><span class="n">timeColumn</span><span class="p">,</span> <span class="n">gapDuration</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Generates session window given a timestamp specifying column.</span>
<span class="sd"> Session window is one of dynamic windows, which means the length of window is varying</span>
<span class="sd"> according to the given inputs. The length of session window is defined as &quot;the timestamp</span>
<span class="sd"> of latest input of the session + gap duration&quot;, so when the new inputs are bound to the</span>
<span class="sd"> current session window, the end time of session window can be expanded according to the new</span>
<span class="sd"> inputs.</span>
<span class="sd"> Windows can support microsecond precision. Windows in the order of months are not supported.</span>
<span class="sd"> For a streaming query, you may use the function `current_timestamp` to generate windows on</span>
<span class="sd"> processing time.</span>
<span class="sd"> gapDuration is provided as strings, e.g. &#39;1 second&#39;, &#39;1 day 12 hours&#39;, &#39;2 minutes&#39;. Valid</span>
<span class="sd"> interval strings are &#39;week&#39;, &#39;day&#39;, &#39;hour&#39;, &#39;minute&#39;, &#39;second&#39;, &#39;millisecond&#39;, &#39;microsecond&#39;.</span>
<span class="sd"> It could also be a Column which can be evaluated to gap duration dynamically based on the</span>
<span class="sd"> input row.</span>
<span class="sd"> The output column will be a struct called &#39;session_window&#39; by default with the nested columns</span>
<span class="sd"> &#39;start&#39; and &#39;end&#39;, where &#39;start&#39; and &#39;end&#39; will be of :class:`pyspark.sql.types.TimestampType`.</span>
<span class="sd"> .. versionadded:: 3.2.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> timeColumn : :class:`~pyspark.sql.Column`</span>
<span class="sd"> The column or the expression to use as the timestamp for windowing by time.</span>
<span class="sd"> The time column must be of TimestampType.</span>
<span class="sd"> gapDuration : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> A column or string specifying the timeout of the session. It could be static value,</span>
<span class="sd"> e.g. `10 minutes`, `1 second`, or an expression/UDF that specifies gap</span>
<span class="sd"> duration dynamically based on the input row.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;2016-03-11 09:00:07&quot;, 1)]).toDF(&quot;date&quot;, &quot;val&quot;)</span>
<span class="sd"> &gt;&gt;&gt; w = df.groupBy(session_window(&quot;date&quot;, &quot;5 seconds&quot;)).agg(sum(&quot;val&quot;).alias(&quot;sum&quot;))</span>
<span class="sd"> &gt;&gt;&gt; w.select(w.session_window.start.cast(&quot;string&quot;).alias(&quot;start&quot;),</span>
<span class="sd"> ... w.session_window.end.cast(&quot;string&quot;).alias(&quot;end&quot;), &quot;sum&quot;).collect()</span>
<span class="sd"> [Row(start=&#39;2016-03-11 09:00:07&#39;, end=&#39;2016-03-11 09:00:12&#39;, sum=1)]</span>
<span class="sd"> &gt;&gt;&gt; w = df.groupBy(session_window(&quot;date&quot;, lit(&quot;5 seconds&quot;))).agg(sum(&quot;val&quot;).alias(&quot;sum&quot;))</span>
<span class="sd"> &gt;&gt;&gt; w.select(w.session_window.start.cast(&quot;string&quot;).alias(&quot;start&quot;),</span>
<span class="sd"> ... w.session_window.end.cast(&quot;string&quot;).alias(&quot;end&quot;), &quot;sum&quot;).collect()</span>
<span class="sd"> [Row(start=&#39;2016-03-11 09:00:07&#39;, end=&#39;2016-03-11 09:00:12&#39;, sum=1)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">check_field</span><span class="p">(</span><span class="n">field</span><span class="p">,</span> <span class="n">fieldName</span><span class="p">):</span>
<span class="k">if</span> <span class="n">field</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">field</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">)):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;</span><span class="si">%s</span><span class="s2"> should be provided as a string or Column&quot;</span> <span class="o">%</span> <span class="n">fieldName</span><span class="p">)</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">time_col</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">timeColumn</span><span class="p">)</span>
<span class="n">check_field</span><span class="p">(</span><span class="n">gapDuration</span><span class="p">,</span> <span class="s2">&quot;gapDuration&quot;</span><span class="p">)</span>
<span class="n">gap_duration</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">gapDuration</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">gapDuration</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span>
<span class="k">else</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">gapDuration</span><span class="p">)</span>
<span class="p">)</span>
<span class="n">res</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">session_window</span><span class="p">(</span><span class="n">time_col</span><span class="p">,</span> <span class="n">gap_duration</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">res</span><span class="p">)</span></div>
<span class="c1"># ---------------------------- misc functions ----------------------------------</span>
<div class="viewcode-block" id="crc32"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.crc32.html#pyspark.sql.functions.crc32">[docs]</a><span class="k">def</span> <span class="nf">crc32</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Calculates the cyclic redundancy check value (CRC32) of a binary column and</span>
<span class="sd"> returns the value as a bigint.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;ABC&#39;,)], [&#39;a&#39;]).select(crc32(&#39;a&#39;).alias(&#39;crc32&#39;)).collect()</span>
<span class="sd"> [Row(crc32=2743272264)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">crc32</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="md5"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.md5.html#pyspark.sql.functions.md5">[docs]</a><span class="k">def</span> <span class="nf">md5</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Calculates the MD5 digest and returns the value as a 32 character hex string.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;ABC&#39;,)], [&#39;a&#39;]).select(md5(&#39;a&#39;).alias(&#39;hash&#39;)).collect()</span>
<span class="sd"> [Row(hash=&#39;902fbdd2b1df0c4f70b4a5d23525e932&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">md5</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="sha1"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.sha1.html#pyspark.sql.functions.sha1">[docs]</a><span class="k">def</span> <span class="nf">sha1</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Returns the hex string result of SHA-1.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;ABC&#39;,)], [&#39;a&#39;]).select(sha1(&#39;a&#39;).alias(&#39;hash&#39;)).collect()</span>
<span class="sd"> [Row(hash=&#39;3c01bdbb26f358bab27f267924aa2c9a03fcfdb8&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">sha1</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="sha2"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.sha2.html#pyspark.sql.functions.sha2">[docs]</a><span class="k">def</span> <span class="nf">sha2</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">numBits</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Returns the hex string result of SHA-2 family of hash functions (SHA-224, SHA-256, SHA-384,</span>
<span class="sd"> and SHA-512). The numBits indicates the desired bit length of the result, which must have a</span>
<span class="sd"> value of 224, 256, 384, 512, or 0 (which is equivalent to 256).</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; digests = df.select(sha2(df.name, 256).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> &gt;&gt;&gt; digests[0]</span>
<span class="sd"> Row(s=&#39;3bc51062973c458d5a6f2d8d64a023246354ad7e064b1e4e009ec8a0699a3043&#39;)</span>
<span class="sd"> &gt;&gt;&gt; digests[1]</span>
<span class="sd"> Row(s=&#39;cd9fb1e148ccd8442e5aa74904cc73bf6fb54d1d54d333bd596aa9bb4bb4e961&#39;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">sha2</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">numBits</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="hash"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.hash.html#pyspark.sql.functions.hash">[docs]</a><span class="k">def</span> <span class="nf">hash</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Calculates the hash code of given columns, and returns the result as an int column.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;ABC&#39;,)], [&#39;a&#39;]).select(hash(&#39;a&#39;).alias(&#39;hash&#39;)).collect()</span>
<span class="sd"> [Row(hash=-757602832)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">hash</span><span class="p">(</span><span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="xxhash64"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.xxhash64.html#pyspark.sql.functions.xxhash64">[docs]</a><span class="k">def</span> <span class="nf">xxhash64</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Calculates the hash code of given columns using the 64-bit variant of the xxHash algorithm,</span>
<span class="sd"> and returns the result as a long column.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;ABC&#39;,)], [&#39;a&#39;]).select(xxhash64(&#39;a&#39;).alias(&#39;hash&#39;)).collect()</span>
<span class="sd"> [Row(hash=4105715581806190027)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">xxhash64</span><span class="p">(</span><span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="assert_true"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.assert_true.html#pyspark.sql.functions.assert_true">[docs]</a><span class="k">def</span> <span class="nf">assert_true</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">errMsg</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns null if the input column is true; throws an exception with the provided error message</span>
<span class="sd"> otherwise.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(0,1)], [&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(assert_true(df.a &lt; df.b).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=None)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(0,1)], [&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(assert_true(df.a &lt; df.b, df.a).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=None)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(0,1)], [&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(assert_true(df.a &lt; df.b, &#39;error&#39;).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=None)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="n">errMsg</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">assert_true</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">errMsg</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">)):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;errMsg should be a Column or a str, got </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">errMsg</span><span class="p">))</span>
<span class="p">)</span>
<span class="n">errMsg</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">errMsg</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">errMsg</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span>
<span class="k">else</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">errMsg</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">assert_true</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">errMsg</span><span class="p">))</span></div>
<div class="viewcode-block" id="raise_error"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.raise_error.html#pyspark.sql.functions.raise_error">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">3.1</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">raise_error</span><span class="p">(</span><span class="n">errMsg</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Throws an exception with the provided error message.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">errMsg</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">)):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;errMsg should be a Column or a str, got </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">errMsg</span><span class="p">))</span>
<span class="p">)</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">errMsg</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">errMsg</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">errMsg</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span>
<span class="k">else</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">errMsg</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">raise_error</span><span class="p">(</span><span class="n">errMsg</span><span class="p">))</span></div>
<span class="c1"># ---------------------- String/Binary functions ------------------------------</span>
<div class="viewcode-block" id="upper"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.upper.html#pyspark.sql.functions.upper">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.5</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">upper</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts a string expression to upper case.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;upper&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="lower"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.lower.html#pyspark.sql.functions.lower">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.5</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">lower</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts a string expression to lower case.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;lower&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="ascii"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.ascii.html#pyspark.sql.functions.ascii">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.5</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">ascii</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the numeric value of the first character of the string column.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;ascii&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="base64"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.base64.html#pyspark.sql.functions.base64">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.5</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">base64</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the BASE64 encoding of a binary column and returns it as a string column.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;base64&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="unbase64"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.unbase64.html#pyspark.sql.functions.unbase64">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.5</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">unbase64</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Decodes a BASE64 encoded string column and returns it as a binary column.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;unbase64&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="ltrim"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.ltrim.html#pyspark.sql.functions.ltrim">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.5</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">ltrim</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Trim the spaces from left end for the specified string value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;ltrim&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="rtrim"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.rtrim.html#pyspark.sql.functions.rtrim">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.5</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">rtrim</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Trim the spaces from right end for the specified string value.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;rtrim&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="trim"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.trim.html#pyspark.sql.functions.trim">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.5</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">trim</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Trim the spaces from both ends for the specified string column.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_function_over_column</span><span class="p">(</span><span class="s2">&quot;trim&quot;</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div>
<div class="viewcode-block" id="concat_ws"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.concat_ws.html#pyspark.sql.functions.concat_ws">[docs]</a><span class="k">def</span> <span class="nf">concat_ws</span><span class="p">(</span><span class="n">sep</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Concatenates multiple input string columns together into a single string column,</span>
<span class="sd"> using the given separator.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;abcd&#39;,&#39;123&#39;)], [&#39;s&#39;, &#39;d&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(concat_ws(&#39;-&#39;, df.s, df.d).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=&#39;abcd-123&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">concat_ws</span><span class="p">(</span><span class="n">sep</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">)))</span></div>
<div class="viewcode-block" id="decode"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.decode.html#pyspark.sql.functions.decode">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.5</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">decode</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">charset</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the first argument into a string from a binary using the provided character set</span>
<span class="sd"> (one of &#39;US-ASCII&#39;, &#39;ISO-8859-1&#39;, &#39;UTF-8&#39;, &#39;UTF-16BE&#39;, &#39;UTF-16LE&#39;, &#39;UTF-16&#39;).</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">charset</span><span class="p">))</span></div>
<div class="viewcode-block" id="encode"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.encode.html#pyspark.sql.functions.encode">[docs]</a><span class="nd">@since</span><span class="p">(</span><span class="mf">1.5</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">encode</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">charset</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes the first argument into a binary from a string using the provided character set</span>
<span class="sd"> (one of &#39;US-ASCII&#39;, &#39;ISO-8859-1&#39;, &#39;UTF-8&#39;, &#39;UTF-16BE&#39;, &#39;UTF-16LE&#39;, &#39;UTF-16&#39;).</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">charset</span><span class="p">))</span></div>
<div class="viewcode-block" id="format_number"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.format_number.html#pyspark.sql.functions.format_number">[docs]</a><span class="k">def</span> <span class="nf">format_number</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">d</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Formats the number X to a format like &#39;#,--#,--#.--&#39;, rounded to d decimal places</span>
<span class="sd"> with HALF_EVEN round mode, and returns the result as a string.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the column name of the numeric value to be formatted</span>
<span class="sd"> d : int</span>
<span class="sd"> the N decimal places</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(5,)], [&#39;a&#39;]).select(format_number(&#39;a&#39;, 4).alias(&#39;v&#39;)).collect()</span>
<span class="sd"> [Row(v=&#39;5.0000&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">format_number</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">d</span><span class="p">))</span></div>
<div class="viewcode-block" id="format_string"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.format_string.html#pyspark.sql.functions.format_string">[docs]</a><span class="k">def</span> <span class="nf">format_string</span><span class="p">(</span><span class="nb">format</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Formats the arguments in printf-style and returns the result as a string column.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> format : str</span>
<span class="sd"> string that can contain embedded format tags and used as result column&#39;s value</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column names or :class:`~pyspark.sql.Column`\\s to be used in formatting</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(5, &quot;hello&quot;)], [&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(format_string(&#39;%d %s&#39;, df.a, df.b).alias(&#39;v&#39;)).collect()</span>
<span class="sd"> [Row(v=&#39;5 hello&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">format_string</span><span class="p">(</span><span class="nb">format</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">)))</span></div>
<div class="viewcode-block" id="instr"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.instr.html#pyspark.sql.functions.instr">[docs]</a><span class="k">def</span> <span class="nf">instr</span><span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">substr</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Locate the position of the first occurrence of substr column in the given string.</span>
<span class="sd"> Returns null if either of the arguments are null.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The position is not zero based, but 1 based index. Returns 0 if substr</span>
<span class="sd"> could not be found in str.</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;abcd&#39;,)], [&#39;s&#39;,])</span>
<span class="sd"> &gt;&gt;&gt; df.select(instr(df.s, &#39;b&#39;).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=2)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">instr</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="nb">str</span><span class="p">),</span> <span class="n">substr</span><span class="p">))</span></div>
<div class="viewcode-block" id="overlay"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.overlay.html#pyspark.sql.functions.overlay">[docs]</a><span class="k">def</span> <span class="nf">overlay</span><span class="p">(</span><span class="n">src</span><span class="p">,</span> <span class="n">replace</span><span class="p">,</span> <span class="n">pos</span><span class="p">,</span> <span class="nb">len</span><span class="o">=-</span><span class="mi">1</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Overlay the specified portion of `src` with `replace`,</span>
<span class="sd"> starting from byte position `pos` of `src` and proceeding for `len` bytes.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;SPARK_SQL&quot;, &quot;CORE&quot;)], (&quot;x&quot;, &quot;y&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(overlay(&quot;x&quot;, &quot;y&quot;, 7).alias(&quot;overlayed&quot;)).show()</span>
<span class="sd"> +----------+</span>
<span class="sd"> | overlayed|</span>
<span class="sd"> +----------+</span>
<span class="sd"> |SPARK_CORE|</span>
<span class="sd"> +----------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pos</span><span class="p">,</span> <span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">)):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;pos should be an integer or a Column / column name, got </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">pos</span><span class="p">)))</span>
<span class="k">if</span> <span class="nb">len</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="nb">len</span><span class="p">,</span> <span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">)):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;len should be an integer or a Column / column name, got </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="nb">len</span><span class="p">)))</span>
<span class="n">pos</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">pos</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pos</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">pos</span><span class="p">)</span>
<span class="nb">len</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="nb">len</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="nb">len</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="nb">len</span><span class="p">)</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">overlay</span><span class="p">(</span>
<span class="n">_to_java_column</span><span class="p">(</span><span class="n">src</span><span class="p">),</span>
<span class="n">_to_java_column</span><span class="p">(</span><span class="n">replace</span><span class="p">),</span>
<span class="n">pos</span><span class="p">,</span>
<span class="nb">len</span>
<span class="p">))</span></div>
<div class="viewcode-block" id="sentences"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.sentences.html#pyspark.sql.functions.sentences">[docs]</a><span class="k">def</span> <span class="nf">sentences</span><span class="p">(</span><span class="n">string</span><span class="p">,</span> <span class="n">language</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">country</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Splits a string into arrays of sentences, where each sentence is an array of words.</span>
<span class="sd"> The &#39;language&#39; and &#39;country&#39; arguments are optional, and if omitted, the default locale is used.</span>
<span class="sd"> .. versionadded:: 3.2.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> string : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a string to be split</span>
<span class="sd"> language : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> a language of the locale</span>
<span class="sd"> country : :class:`~pyspark.sql.Column` or str, optional</span>
<span class="sd"> a country of the locale</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([[&quot;This is an example sentence.&quot;]], [&quot;string&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(sentences(df.string, lit(&quot;en&quot;), lit(&quot;US&quot;))).show(truncate=False)</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> |sentences(string, en, US) |</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> |[[This, is, an, example, sentence]]|</span>
<span class="sd"> +-----------------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">language</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">language</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">&quot;&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">country</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">country</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">&quot;&quot;</span><span class="p">)</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">sentences</span><span class="p">(</span>
<span class="n">_to_java_column</span><span class="p">(</span><span class="n">string</span><span class="p">),</span>
<span class="n">_to_java_column</span><span class="p">(</span><span class="n">language</span><span class="p">),</span>
<span class="n">_to_java_column</span><span class="p">(</span><span class="n">country</span><span class="p">)</span>
<span class="p">))</span></div>
<div class="viewcode-block" id="substring"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.substring.html#pyspark.sql.functions.substring">[docs]</a><span class="k">def</span> <span class="nf">substring</span><span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">pos</span><span class="p">,</span> <span class="nb">len</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Substring starts at `pos` and is of length `len` when str is String type or</span>
<span class="sd"> returns the slice of byte array that starts at `pos` in byte and is of length `len`</span>
<span class="sd"> when str is Binary type.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The position is not zero based, but 1 based index.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;abcd&#39;,)], [&#39;s&#39;,])</span>
<span class="sd"> &gt;&gt;&gt; df.select(substring(df.s, 1, 2).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=&#39;ab&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">substring</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="nb">str</span><span class="p">),</span> <span class="n">pos</span><span class="p">,</span> <span class="nb">len</span><span class="p">))</span></div>
<div class="viewcode-block" id="substring_index"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.substring_index.html#pyspark.sql.functions.substring_index">[docs]</a><span class="k">def</span> <span class="nf">substring_index</span><span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">delim</span><span class="p">,</span> <span class="n">count</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the substring from string str before count occurrences of the delimiter delim.</span>
<span class="sd"> If count is positive, everything the left of the final delimiter (counting from left) is</span>
<span class="sd"> returned. If count is negative, every to the right of the final delimiter (counting from the</span>
<span class="sd"> right) is returned. substring_index performs a case-sensitive match when searching for delim.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;a.b.c.d&#39;,)], [&#39;s&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(substring_index(df.s, &#39;.&#39;, 2).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=&#39;a.b&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(substring_index(df.s, &#39;.&#39;, -3).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=&#39;b.c.d&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">substring_index</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="nb">str</span><span class="p">),</span> <span class="n">delim</span><span class="p">,</span> <span class="n">count</span><span class="p">))</span></div>
<div class="viewcode-block" id="levenshtein"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.levenshtein.html#pyspark.sql.functions.levenshtein">[docs]</a><span class="k">def</span> <span class="nf">levenshtein</span><span class="p">(</span><span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Computes the Levenshtein distance of the two given strings.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df0 = spark.createDataFrame([(&#39;kitten&#39;, &#39;sitting&#39;,)], [&#39;l&#39;, &#39;r&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df0.select(levenshtein(&#39;l&#39;, &#39;r&#39;).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=3)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">levenshtein</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">left</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">right</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="locate"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.locate.html#pyspark.sql.functions.locate">[docs]</a><span class="k">def</span> <span class="nf">locate</span><span class="p">(</span><span class="n">substr</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">pos</span><span class="o">=</span><span class="mi">1</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Locate the position of the first occurrence of substr in a string column, after position pos.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> substr : str</span>
<span class="sd"> a string</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a Column of :class:`pyspark.sql.types.StringType`</span>
<span class="sd"> pos : int, optional</span>
<span class="sd"> start position (zero based)</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The position is not zero based, but 1 based index. Returns 0 if substr</span>
<span class="sd"> could not be found in str.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;abcd&#39;,)], [&#39;s&#39;,])</span>
<span class="sd"> &gt;&gt;&gt; df.select(locate(&#39;b&#39;, df.s, 1).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=2)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">locate</span><span class="p">(</span><span class="n">substr</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="nb">str</span><span class="p">),</span> <span class="n">pos</span><span class="p">))</span></div>
<div class="viewcode-block" id="lpad"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.lpad.html#pyspark.sql.functions.lpad">[docs]</a><span class="k">def</span> <span class="nf">lpad</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">len</span><span class="p">,</span> <span class="n">pad</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Left-pad the string column to width `len` with `pad`.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;abcd&#39;,)], [&#39;s&#39;,])</span>
<span class="sd"> &gt;&gt;&gt; df.select(lpad(df.s, 6, &#39;#&#39;).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=&#39;##abcd&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">lpad</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="nb">len</span><span class="p">,</span> <span class="n">pad</span><span class="p">))</span></div>
<div class="viewcode-block" id="rpad"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.rpad.html#pyspark.sql.functions.rpad">[docs]</a><span class="k">def</span> <span class="nf">rpad</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">len</span><span class="p">,</span> <span class="n">pad</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Right-pad the string column to width `len` with `pad`.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;abcd&#39;,)], [&#39;s&#39;,])</span>
<span class="sd"> &gt;&gt;&gt; df.select(rpad(df.s, 6, &#39;#&#39;).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=&#39;abcd##&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">rpad</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="nb">len</span><span class="p">,</span> <span class="n">pad</span><span class="p">))</span></div>
<div class="viewcode-block" id="repeat"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.repeat.html#pyspark.sql.functions.repeat">[docs]</a><span class="k">def</span> <span class="nf">repeat</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">n</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Repeats a string column n times, and returns it as a new string column.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;ab&#39;,)], [&#39;s&#39;,])</span>
<span class="sd"> &gt;&gt;&gt; df.select(repeat(df.s, 3).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=&#39;ababab&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">repeat</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">n</span><span class="p">))</span></div>
<div class="viewcode-block" id="split"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.split.html#pyspark.sql.functions.split">[docs]</a><span class="k">def</span> <span class="nf">split</span><span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">pattern</span><span class="p">,</span> <span class="n">limit</span><span class="o">=-</span><span class="mi">1</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Splits str around matches of the given pattern.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> str : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a string expression to split</span>
<span class="sd"> pattern : str</span>
<span class="sd"> a string representing a regular expression. The regex string should be</span>
<span class="sd"> a Java regular expression.</span>
<span class="sd"> limit : int, optional</span>
<span class="sd"> an integer which controls the number of times `pattern` is applied.</span>
<span class="sd"> * ``limit &gt; 0``: The resulting array&#39;s length will not be more than `limit`, and the</span>
<span class="sd"> resulting array&#39;s last entry will contain all input beyond the last</span>
<span class="sd"> matched pattern.</span>
<span class="sd"> * ``limit &lt;= 0``: `pattern` will be applied as many times as possible, and the resulting</span>
<span class="sd"> array can be of any size.</span>
<span class="sd"> .. versionchanged:: 3.0</span>
<span class="sd"> `split` now takes an optional `limit` field. If not provided, default limit value is -1.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;oneAtwoBthreeC&#39;,)], [&#39;s&#39;,])</span>
<span class="sd"> &gt;&gt;&gt; df.select(split(df.s, &#39;[ABC]&#39;, 2).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=[&#39;one&#39;, &#39;twoBthreeC&#39;])]</span>
<span class="sd"> &gt;&gt;&gt; df.select(split(df.s, &#39;[ABC]&#39;, -1).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=[&#39;one&#39;, &#39;two&#39;, &#39;three&#39;, &#39;&#39;])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="nb">str</span><span class="p">),</span> <span class="n">pattern</span><span class="p">,</span> <span class="n">limit</span><span class="p">))</span></div>
<div class="viewcode-block" id="regexp_extract"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.regexp_extract.html#pyspark.sql.functions.regexp_extract">[docs]</a><span class="k">def</span> <span class="nf">regexp_extract</span><span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">pattern</span><span class="p">,</span> <span class="n">idx</span><span class="p">):</span>
<span class="sa">r</span><span class="sd">&quot;&quot;&quot;Extract a specific group matched by a Java regex, from the specified string column.</span>
<span class="sd"> If the regex did not match, or the specified group did not match, an empty string is returned.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;100-200&#39;,)], [&#39;str&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_extract(&#39;str&#39;, r&#39;(\d+)-(\d+)&#39;, 1).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=&#39;100&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;foo&#39;,)], [&#39;str&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_extract(&#39;str&#39;, r&#39;(\d+)&#39;, 1).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=&#39;&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;aaaac&#39;,)], [&#39;str&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_extract(&#39;str&#39;, &#39;(a+)(b)?(c)&#39;, 2).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=&#39;&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">regexp_extract</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="nb">str</span><span class="p">),</span> <span class="n">pattern</span><span class="p">,</span> <span class="n">idx</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="regexp_replace"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.regexp_replace.html#pyspark.sql.functions.regexp_replace">[docs]</a><span class="k">def</span> <span class="nf">regexp_replace</span><span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">pattern</span><span class="p">,</span> <span class="n">replacement</span><span class="p">):</span>
<span class="sa">r</span><span class="sd">&quot;&quot;&quot;Replace all substrings of the specified string value that match regexp with rep.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;100-200&#39;,)], [&#39;str&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(regexp_replace(&#39;str&#39;, r&#39;(\d+)&#39;, &#39;--&#39;).alias(&#39;d&#39;)).collect()</span>
<span class="sd"> [Row(d=&#39;-----&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">regexp_replace</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="nb">str</span><span class="p">),</span> <span class="n">pattern</span><span class="p">,</span> <span class="n">replacement</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="initcap"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.initcap.html#pyspark.sql.functions.initcap">[docs]</a><span class="k">def</span> <span class="nf">initcap</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Translate the first letter of each word to upper case in the sentence.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;ab cd&#39;,)], [&#39;a&#39;]).select(initcap(&quot;a&quot;).alias(&#39;v&#39;)).collect()</span>
<span class="sd"> [Row(v=&#39;Ab Cd&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">initcap</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="soundex"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.soundex.html#pyspark.sql.functions.soundex">[docs]</a><span class="k">def</span> <span class="nf">soundex</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the SoundEx encoding for a string</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;Peters&quot;,),(&quot;Uhrbach&quot;,)], [&#39;name&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(soundex(df.name).alias(&quot;soundex&quot;)).collect()</span>
<span class="sd"> [Row(soundex=&#39;P362&#39;), Row(soundex=&#39;U612&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">soundex</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="bin"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.bin.html#pyspark.sql.functions.bin">[docs]</a><span class="k">def</span> <span class="nf">bin</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Returns the string representation of the binary value of the given column.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.select(bin(df.age).alias(&#39;c&#39;)).collect()</span>
<span class="sd"> [Row(c=&#39;10&#39;), Row(c=&#39;101&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">bin</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="hex"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.hex.html#pyspark.sql.functions.hex">[docs]</a><span class="k">def</span> <span class="nf">hex</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Computes hex value of the given column, which could be :class:`pyspark.sql.types.StringType`,</span>
<span class="sd"> :class:`pyspark.sql.types.BinaryType`, :class:`pyspark.sql.types.IntegerType` or</span>
<span class="sd"> :class:`pyspark.sql.types.LongType`.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;ABC&#39;, 3)], [&#39;a&#39;, &#39;b&#39;]).select(hex(&#39;a&#39;), hex(&#39;b&#39;)).collect()</span>
<span class="sd"> [Row(hex(a)=&#39;414243&#39;, hex(b)=&#39;3&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">hex</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="unhex"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.unhex.html#pyspark.sql.functions.unhex">[docs]</a><span class="k">def</span> <span class="nf">unhex</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Inverse of hex. Interprets each pair of characters as a hexadecimal number</span>
<span class="sd"> and converts to the byte representation of number.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;414243&#39;,)], [&#39;a&#39;]).select(unhex(&#39;a&#39;)).collect()</span>
<span class="sd"> [Row(unhex(a)=bytearray(b&#39;ABC&#39;))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">unhex</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="length"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.length.html#pyspark.sql.functions.length">[docs]</a><span class="k">def</span> <span class="nf">length</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Computes the character length of string data or number of bytes of binary data.</span>
<span class="sd"> The length of character data includes the trailing spaces. The length of binary data</span>
<span class="sd"> includes binary zeros.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;ABC &#39;,)], [&#39;a&#39;]).select(length(&#39;a&#39;).alias(&#39;length&#39;)).collect()</span>
<span class="sd"> [Row(length=4)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">length</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="translate"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.translate.html#pyspark.sql.functions.translate">[docs]</a><span class="k">def</span> <span class="nf">translate</span><span class="p">(</span><span class="n">srcCol</span><span class="p">,</span> <span class="n">matching</span><span class="p">,</span> <span class="n">replace</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;A function translate any character in the `srcCol` by a character in `matching`.</span>
<span class="sd"> The characters in `replace` is corresponding to the characters in `matching`.</span>
<span class="sd"> The translate will happen when any character in the string matching with the character</span>
<span class="sd"> in the `matching`.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;translate&#39;,)], [&#39;a&#39;]).select(translate(&#39;a&#39;, &quot;rnlt&quot;, &quot;123&quot;) \\</span>
<span class="sd"> ... .alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=&#39;1a2s3ae&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">translate</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">srcCol</span><span class="p">),</span> <span class="n">matching</span><span class="p">,</span> <span class="n">replace</span><span class="p">))</span></div>
<span class="c1"># ---------------------- Collection functions ------------------------------</span>
<div class="viewcode-block" id="create_map"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.create_map.html#pyspark.sql.functions.create_map">[docs]</a><span class="k">def</span> <span class="nf">create_map</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Creates a new map column.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column names or :class:`~pyspark.sql.Column`\\s that are</span>
<span class="sd"> grouped as key-value pairs, e.g. (key1, value1, key2, value2, ...).</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.select(create_map(&#39;name&#39;, &#39;age&#39;).alias(&quot;map&quot;)).collect()</span>
<span class="sd"> [Row(map={&#39;Alice&#39;: 2}), Row(map={&#39;Bob&#39;: 5})]</span>
<span class="sd"> &gt;&gt;&gt; df.select(create_map([df.name, df.age]).alias(&quot;map&quot;)).collect()</span>
<span class="sd"> [Row(map={&#39;Alice&#39;: 2}), Row(map={&#39;Bob&#39;: 5})]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">set</span><span class="p">)):</span>
<span class="n">cols</span> <span class="o">=</span> <span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="map_from_arrays"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.map_from_arrays.html#pyspark.sql.functions.map_from_arrays">[docs]</a><span class="k">def</span> <span class="nf">map_from_arrays</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Creates a new map from two arrays.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing a set of keys. All elements should not be null</span>
<span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing a set of values</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([2, 5], [&#39;a&#39;, &#39;b&#39;])], [&#39;k&#39;, &#39;v&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(map_from_arrays(df.k, df.v).alias(&quot;map&quot;)).show()</span>
<span class="sd"> +----------------+</span>
<span class="sd"> | map|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> |{2 -&gt; a, 5 -&gt; b}|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">map_from_arrays</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col1</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col2</span><span class="p">)))</span></div>
<div class="viewcode-block" id="array"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.array.html#pyspark.sql.functions.array">[docs]</a><span class="k">def</span> <span class="nf">array</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Creates a new array column.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column names or :class:`~pyspark.sql.Column`\\s that have</span>
<span class="sd"> the same data type.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.select(array(&#39;age&#39;, &#39;age&#39;).alias(&quot;arr&quot;)).collect()</span>
<span class="sd"> [Row(arr=[2, 2]), Row(arr=[5, 5])]</span>
<span class="sd"> &gt;&gt;&gt; df.select(array([df.age, df.age]).alias(&quot;arr&quot;)).collect()</span>
<span class="sd"> [Row(arr=[2, 2]), Row(arr=[5, 5])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">set</span><span class="p">)):</span>
<span class="n">cols</span> <span class="o">=</span> <span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="array_contains"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.array_contains.html#pyspark.sql.functions.array_contains">[docs]</a><span class="k">def</span> <span class="nf">array_contains</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns null if the array is null, true if the array contains the</span>
<span class="sd"> given value, and false otherwise.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array</span>
<span class="sd"> value :</span>
<span class="sd"> value or column to check for in array</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([&quot;a&quot;, &quot;b&quot;, &quot;c&quot;],), ([],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_contains(df.data, &quot;a&quot;)).collect()</span>
<span class="sd"> [Row(array_contains(data, a)=True), Row(array_contains(data, a)=False)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_contains(df.data, lit(&quot;a&quot;))).collect()</span>
<span class="sd"> [Row(array_contains(data, a)=True), Row(array_contains(data, a)=False)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">value</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">_jc</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> <span class="k">else</span> <span class="n">value</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">array_contains</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">value</span><span class="p">))</span></div>
<div class="viewcode-block" id="arrays_overlap"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.arrays_overlap.html#pyspark.sql.functions.arrays_overlap">[docs]</a><span class="k">def</span> <span class="nf">arrays_overlap</span><span class="p">(</span><span class="n">a1</span><span class="p">,</span> <span class="n">a2</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns true if the arrays contain any common non-null element; if not,</span>
<span class="sd"> returns null if both the arrays are non-empty and any of them contains a null element; returns</span>
<span class="sd"> false otherwise.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([&quot;a&quot;, &quot;b&quot;], [&quot;b&quot;, &quot;c&quot;]), ([&quot;a&quot;], [&quot;b&quot;, &quot;c&quot;])], [&#39;x&#39;, &#39;y&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(arrays_overlap(df.x, df.y).alias(&quot;overlap&quot;)).collect()</span>
<span class="sd"> [Row(overlap=True), Row(overlap=False)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">arrays_overlap</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">a1</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">a2</span><span class="p">)))</span></div>
<div class="viewcode-block" id="slice"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.slice.html#pyspark.sql.functions.slice">[docs]</a><span class="k">def</span> <span class="nf">slice</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">start</span><span class="p">,</span> <span class="n">length</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns an array containing all the elements in `x` from index `start`</span>
<span class="sd"> (array indices start at 1, or from the end if `start` is negative) with the specified `length`.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> x : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the array to be sliced</span>
<span class="sd"> start : :class:`~pyspark.sql.Column` or int</span>
<span class="sd"> the starting index</span>
<span class="sd"> length : :class:`~pyspark.sql.Column` or int</span>
<span class="sd"> the length of the slice</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([1, 2, 3],), ([4, 5],)], [&#39;x&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(slice(df.x, 2, 2).alias(&quot;sliced&quot;)).collect()</span>
<span class="sd"> [Row(sliced=[2, 3]), Row(sliced=[5])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">slice</span><span class="p">(</span>
<span class="n">_to_java_column</span><span class="p">(</span><span class="n">x</span><span class="p">),</span>
<span class="n">start</span><span class="o">.</span><span class="n">_jc</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">start</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> <span class="k">else</span> <span class="n">start</span><span class="p">,</span>
<span class="n">length</span><span class="o">.</span><span class="n">_jc</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">length</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> <span class="k">else</span> <span class="n">length</span>
<span class="p">))</span></div>
<div class="viewcode-block" id="array_join"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.array_join.html#pyspark.sql.functions.array_join">[docs]</a><span class="k">def</span> <span class="nf">array_join</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">delimiter</span><span class="p">,</span> <span class="n">null_replacement</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Concatenates the elements of `column` using the `delimiter`. Null values are replaced with</span>
<span class="sd"> `null_replacement` if set, otherwise they are ignored.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([&quot;a&quot;, &quot;b&quot;, &quot;c&quot;],), ([&quot;a&quot;, None],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_join(df.data, &quot;,&quot;).alias(&quot;joined&quot;)).collect()</span>
<span class="sd"> [Row(joined=&#39;a,b,c&#39;), Row(joined=&#39;a&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_join(df.data, &quot;,&quot;, &quot;NULL&quot;).alias(&quot;joined&quot;)).collect()</span>
<span class="sd"> [Row(joined=&#39;a,b,c&#39;), Row(joined=&#39;a,NULL&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="n">null_replacement</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">array_join</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">delimiter</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">array_join</span><span class="p">(</span>
<span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">delimiter</span><span class="p">,</span> <span class="n">null_replacement</span><span class="p">))</span></div>
<div class="viewcode-block" id="concat"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.concat.html#pyspark.sql.functions.concat">[docs]</a><span class="k">def</span> <span class="nf">concat</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Concatenates multiple input columns together into a single column.</span>
<span class="sd"> The function works with strings, binary and compatible array columns.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;abcd&#39;,&#39;123&#39;)], [&#39;s&#39;, &#39;d&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(concat(df.s, df.d).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=&#39;abcd123&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([1, 2], [3, 4], [5]), ([1, 2], None, [3])], [&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(concat(df.a, df.b, df.c).alias(&quot;arr&quot;)).collect()</span>
<span class="sd"> [Row(arr=[1, 2, 3, 4, 5]), Row(arr=None)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">concat</span><span class="p">(</span><span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">)))</span></div>
<div class="viewcode-block" id="array_position"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.array_position.html#pyspark.sql.functions.array_position">[docs]</a><span class="k">def</span> <span class="nf">array_position</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: Locates the position of the first occurrence of the given value</span>
<span class="sd"> in the given array. Returns null if either of the arguments are null.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The position is not zero based, but 1 based index. Returns 0 if the given</span>
<span class="sd"> value could not be found in the array.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([&quot;c&quot;, &quot;b&quot;, &quot;a&quot;],), ([],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_position(df.data, &quot;a&quot;)).collect()</span>
<span class="sd"> [Row(array_position(data, a)=3), Row(array_position(data, a)=0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">array_position</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">value</span><span class="p">))</span></div>
<div class="viewcode-block" id="element_at"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.element_at.html#pyspark.sql.functions.element_at">[docs]</a><span class="k">def</span> <span class="nf">element_at</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">extraction</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: Returns element of array at given index in extraction if col is array.</span>
<span class="sd"> Returns value for the given key in extraction if col is map.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array or map</span>
<span class="sd"> extraction :</span>
<span class="sd"> index to check for in array or key to check for in map</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The position is not zero based, but 1 based index.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([&quot;a&quot;, &quot;b&quot;, &quot;c&quot;],), ([],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(element_at(df.data, 1)).collect()</span>
<span class="sd"> [Row(element_at(data, 1)=&#39;a&#39;), Row(element_at(data, 1)=None)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([({&quot;a&quot;: 1.0, &quot;b&quot;: 2.0},), ({},)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(element_at(df.data, lit(&quot;a&quot;))).collect()</span>
<span class="sd"> [Row(element_at(data, a)=1.0), Row(element_at(data, a)=None)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">element_at</span><span class="p">(</span>
<span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">lit</span><span class="p">(</span><span class="n">extraction</span><span class="p">)</span><span class="o">.</span><span class="n">_jc</span><span class="p">))</span></div>
<div class="viewcode-block" id="array_remove"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.array_remove.html#pyspark.sql.functions.array_remove">[docs]</a><span class="k">def</span> <span class="nf">array_remove</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">element</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: Remove all elements that equal to element from the given array.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array</span>
<span class="sd"> element :</span>
<span class="sd"> element to be removed from the array</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([1, 2, 3, 1, 1],), ([],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_remove(df.data, 1)).collect()</span>
<span class="sd"> [Row(array_remove(data, 1)=[2, 3]), Row(array_remove(data, 1)=[])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">array_remove</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">element</span><span class="p">))</span></div>
<div class="viewcode-block" id="array_distinct"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.array_distinct.html#pyspark.sql.functions.array_distinct">[docs]</a><span class="k">def</span> <span class="nf">array_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: removes duplicate values from the array.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([1, 2, 3, 2],), ([4, 5, 5, 4],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_distinct(df.data)).collect()</span>
<span class="sd"> [Row(array_distinct(data)=[1, 2, 3]), Row(array_distinct(data)=[4, 5])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">array_distinct</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="array_intersect"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.array_intersect.html#pyspark.sql.functions.array_intersect">[docs]</a><span class="k">def</span> <span class="nf">array_intersect</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns an array of the elements in the intersection of col1 and col2,</span>
<span class="sd"> without duplicates.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array</span>
<span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([Row(c1=[&quot;b&quot;, &quot;a&quot;, &quot;c&quot;], c2=[&quot;c&quot;, &quot;d&quot;, &quot;a&quot;, &quot;f&quot;])])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_intersect(df.c1, df.c2)).collect()</span>
<span class="sd"> [Row(array_intersect(c1, c2)=[&#39;a&#39;, &#39;c&#39;])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">array_intersect</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col1</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col2</span><span class="p">)))</span></div>
<div class="viewcode-block" id="array_union"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.array_union.html#pyspark.sql.functions.array_union">[docs]</a><span class="k">def</span> <span class="nf">array_union</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns an array of the elements in the union of col1 and col2,</span>
<span class="sd"> without duplicates.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array</span>
<span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([Row(c1=[&quot;b&quot;, &quot;a&quot;, &quot;c&quot;], c2=[&quot;c&quot;, &quot;d&quot;, &quot;a&quot;, &quot;f&quot;])])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_union(df.c1, df.c2)).collect()</span>
<span class="sd"> [Row(array_union(c1, c2)=[&#39;b&#39;, &#39;a&#39;, &#39;c&#39;, &#39;d&#39;, &#39;f&#39;])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">array_union</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col1</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col2</span><span class="p">)))</span></div>
<div class="viewcode-block" id="array_except"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.array_except.html#pyspark.sql.functions.array_except">[docs]</a><span class="k">def</span> <span class="nf">array_except</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns an array of the elements in col1 but not in col2,</span>
<span class="sd"> without duplicates.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array</span>
<span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing array</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([Row(c1=[&quot;b&quot;, &quot;a&quot;, &quot;c&quot;], c2=[&quot;c&quot;, &quot;d&quot;, &quot;a&quot;, &quot;f&quot;])])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_except(df.c1, df.c2)).collect()</span>
<span class="sd"> [Row(array_except(c1, c2)=[&#39;b&#39;])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">array_except</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col1</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col2</span><span class="p">)))</span></div>
<div class="viewcode-block" id="explode"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.explode.html#pyspark.sql.functions.explode">[docs]</a><span class="k">def</span> <span class="nf">explode</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a new row for each element in the given array or map.</span>
<span class="sd"> Uses the default column name `col` for elements in the array and</span>
<span class="sd"> `key` and `value` for elements in the map unless specified otherwise.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; eDF = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={&quot;a&quot;: &quot;b&quot;})])</span>
<span class="sd"> &gt;&gt;&gt; eDF.select(explode(eDF.intlist).alias(&quot;anInt&quot;)).collect()</span>
<span class="sd"> [Row(anInt=1), Row(anInt=2), Row(anInt=3)]</span>
<span class="sd"> &gt;&gt;&gt; eDF.select(explode(eDF.mapfield).alias(&quot;key&quot;, &quot;value&quot;)).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |key|value|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | a| b|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="posexplode"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.posexplode.html#pyspark.sql.functions.posexplode">[docs]</a><span class="k">def</span> <span class="nf">posexplode</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a new row for each element with position in the given array or map.</span>
<span class="sd"> Uses the default column name `pos` for position, and `col` for elements in the</span>
<span class="sd"> array and `key` and `value` for elements in the map unless specified otherwise.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; eDF = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={&quot;a&quot;: &quot;b&quot;})])</span>
<span class="sd"> &gt;&gt;&gt; eDF.select(posexplode(eDF.intlist)).collect()</span>
<span class="sd"> [Row(pos=0, col=1), Row(pos=1, col=2), Row(pos=2, col=3)]</span>
<span class="sd"> &gt;&gt;&gt; eDF.select(posexplode(eDF.mapfield)).show()</span>
<span class="sd"> +---+---+-----+</span>
<span class="sd"> |pos|key|value|</span>
<span class="sd"> +---+---+-----+</span>
<span class="sd"> | 0| a| b|</span>
<span class="sd"> +---+---+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">posexplode</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="explode_outer"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.explode_outer.html#pyspark.sql.functions.explode_outer">[docs]</a><span class="k">def</span> <span class="nf">explode_outer</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a new row for each element in the given array or map.</span>
<span class="sd"> Unlike explode, if the array/map is null or empty then null is produced.</span>
<span class="sd"> Uses the default column name `col` for elements in the array and</span>
<span class="sd"> `key` and `value` for elements in the map unless specified otherwise.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(1, [&quot;foo&quot;, &quot;bar&quot;], {&quot;x&quot;: 1.0}), (2, [], {}), (3, None, None)],</span>
<span class="sd"> ... (&quot;id&quot;, &quot;an_array&quot;, &quot;a_map&quot;)</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;id&quot;, &quot;an_array&quot;, explode_outer(&quot;a_map&quot;)).show()</span>
<span class="sd"> +---+----------+----+-----+</span>
<span class="sd"> | id| an_array| key|value|</span>
<span class="sd"> +---+----------+----+-----+</span>
<span class="sd"> | 1|[foo, bar]| x| 1.0|</span>
<span class="sd"> | 2| []|null| null|</span>
<span class="sd"> | 3| null|null| null|</span>
<span class="sd"> +---+----------+----+-----+</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;id&quot;, &quot;a_map&quot;, explode_outer(&quot;an_array&quot;)).show()</span>
<span class="sd"> +---+----------+----+</span>
<span class="sd"> | id| a_map| col|</span>
<span class="sd"> +---+----------+----+</span>
<span class="sd"> | 1|{x -&gt; 1.0}| foo|</span>
<span class="sd"> | 1|{x -&gt; 1.0}| bar|</span>
<span class="sd"> | 2| {}|null|</span>
<span class="sd"> | 3| null|null|</span>
<span class="sd"> +---+----------+----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">explode_outer</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="posexplode_outer"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.posexplode_outer.html#pyspark.sql.functions.posexplode_outer">[docs]</a><span class="k">def</span> <span class="nf">posexplode_outer</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a new row for each element with position in the given array or map.</span>
<span class="sd"> Unlike posexplode, if the array/map is null or empty then the row (null, null) is produced.</span>
<span class="sd"> Uses the default column name `pos` for position, and `col` for elements in the</span>
<span class="sd"> array and `key` and `value` for elements in the map unless specified otherwise.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(1, [&quot;foo&quot;, &quot;bar&quot;], {&quot;x&quot;: 1.0}), (2, [], {}), (3, None, None)],</span>
<span class="sd"> ... (&quot;id&quot;, &quot;an_array&quot;, &quot;a_map&quot;)</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;id&quot;, &quot;an_array&quot;, posexplode_outer(&quot;a_map&quot;)).show()</span>
<span class="sd"> +---+----------+----+----+-----+</span>
<span class="sd"> | id| an_array| pos| key|value|</span>
<span class="sd"> +---+----------+----+----+-----+</span>
<span class="sd"> | 1|[foo, bar]| 0| x| 1.0|</span>
<span class="sd"> | 2| []|null|null| null|</span>
<span class="sd"> | 3| null|null|null| null|</span>
<span class="sd"> +---+----------+----+----+-----+</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;id&quot;, &quot;a_map&quot;, posexplode_outer(&quot;an_array&quot;)).show()</span>
<span class="sd"> +---+----------+----+----+</span>
<span class="sd"> | id| a_map| pos| col|</span>
<span class="sd"> +---+----------+----+----+</span>
<span class="sd"> | 1|{x -&gt; 1.0}| 0| foo|</span>
<span class="sd"> | 1|{x -&gt; 1.0}| 1| bar|</span>
<span class="sd"> | 2| {}|null|null|</span>
<span class="sd"> | 3| null|null|null|</span>
<span class="sd"> +---+----------+----+----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">posexplode_outer</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="get_json_object"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.get_json_object.html#pyspark.sql.functions.get_json_object">[docs]</a><span class="k">def</span> <span class="nf">get_json_object</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">path</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Extracts json object from a json string based on json path specified, and returns json string</span>
<span class="sd"> of the extracted json object. It will return null if the input json string is invalid.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> string column in json format</span>
<span class="sd"> path : str</span>
<span class="sd"> path to the json object to extract</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; data = [(&quot;1&quot;, &#39;&#39;&#39;{&quot;f1&quot;: &quot;value1&quot;, &quot;f2&quot;: &quot;value2&quot;}&#39;&#39;&#39;), (&quot;2&quot;, &#39;&#39;&#39;{&quot;f1&quot;: &quot;value12&quot;}&#39;&#39;&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;jstring&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(df.key, get_json_object(df.jstring, &#39;$.f1&#39;).alias(&quot;c0&quot;), \\</span>
<span class="sd"> ... get_json_object(df.jstring, &#39;$.f2&#39;).alias(&quot;c1&quot;) ).collect()</span>
<span class="sd"> [Row(key=&#39;1&#39;, c0=&#39;value1&#39;, c1=&#39;value2&#39;), Row(key=&#39;2&#39;, c0=&#39;value12&#39;, c1=None)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">get_json_object</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">path</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="json_tuple"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.json_tuple.html#pyspark.sql.functions.json_tuple">[docs]</a><span class="k">def</span> <span class="nf">json_tuple</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="o">*</span><span class="n">fields</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Creates a new row for a json column according to the given field names.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> string column in json format</span>
<span class="sd"> fields : str</span>
<span class="sd"> fields to extract</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; data = [(&quot;1&quot;, &#39;&#39;&#39;{&quot;f1&quot;: &quot;value1&quot;, &quot;f2&quot;: &quot;value2&quot;}&#39;&#39;&#39;), (&quot;2&quot;, &#39;&#39;&#39;{&quot;f1&quot;: &quot;value12&quot;}&#39;&#39;&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;jstring&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(df.key, json_tuple(df.jstring, &#39;f1&#39;, &#39;f2&#39;)).collect()</span>
<span class="sd"> [Row(key=&#39;1&#39;, c0=&#39;value1&#39;, c1=&#39;value2&#39;), Row(key=&#39;2&#39;, c0=&#39;value12&#39;, c1=None)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">json_tuple</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">fields</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="from_json"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.from_json.html#pyspark.sql.functions.from_json">[docs]</a><span class="k">def</span> <span class="nf">from_json</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">schema</span><span class="p">,</span> <span class="n">options</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Parses a column containing a JSON string into a :class:`MapType` with :class:`StringType`</span>
<span class="sd"> as keys type, :class:`StructType` or :class:`ArrayType` with</span>
<span class="sd"> the specified schema. Returns `null`, in the case of an unparseable string.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> string column in json format</span>
<span class="sd"> schema : :class:`DataType` or str</span>
<span class="sd"> a StructType or ArrayType of StructType to use when parsing the json column.</span>
<span class="sd"> .. versionchanged:: 2.3</span>
<span class="sd"> the DDL-formatted string is also supported for ``schema``.</span>
<span class="sd"> options : dict, optional</span>
<span class="sd"> options to control parsing. accepts the same options as the json datasource.</span>
<span class="sd"> See `Data Source Option &lt;https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option&gt;`_</span>
<span class="sd"> in the version you use.</span>
<span class="sd"> .. # noqa</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.types import *</span>
<span class="sd"> &gt;&gt;&gt; data = [(1, &#39;&#39;&#39;{&quot;a&quot;: 1}&#39;&#39;&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; schema = StructType([StructField(&quot;a&quot;, IntegerType())])</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;value&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_json(df.value, schema).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=Row(a=1))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_json(df.value, &quot;a INT&quot;).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=Row(a=1))]</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_json(df.value, &quot;MAP&lt;STRING,INT&gt;&quot;).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json={&#39;a&#39;: 1})]</span>
<span class="sd"> &gt;&gt;&gt; data = [(1, &#39;&#39;&#39;[{&quot;a&quot;: 1}]&#39;&#39;&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; schema = ArrayType(StructType([StructField(&quot;a&quot;, IntegerType())]))</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;value&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_json(df.value, schema).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=[Row(a=1)])]</span>
<span class="sd"> &gt;&gt;&gt; schema = schema_of_json(lit(&#39;&#39;&#39;{&quot;a&quot;: 0}&#39;&#39;&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_json(df.value, schema).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=Row(a=None))]</span>
<span class="sd"> &gt;&gt;&gt; data = [(1, &#39;&#39;&#39;[1, 2, 3]&#39;&#39;&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; schema = ArrayType(IntegerType())</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;value&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_json(df.value, schema).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=[1, 2, 3])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">DataType</span><span class="p">):</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">schema</span><span class="o">.</span><span class="n">json</span><span class="p">()</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">from_json</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">schema</span><span class="p">,</span> <span class="n">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="to_json"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.to_json.html#pyspark.sql.functions.to_json">[docs]</a><span class="k">def</span> <span class="nf">to_json</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">options</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts a column containing a :class:`StructType`, :class:`ArrayType` or a :class:`MapType`</span>
<span class="sd"> into a JSON string. Throws an exception, in the case of an unsupported type.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing a struct, an array or a map.</span>
<span class="sd"> options : dict, optional</span>
<span class="sd"> options to control converting. accepts the same options as the JSON datasource.</span>
<span class="sd"> See `Data Source Option &lt;https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option&gt;`_</span>
<span class="sd"> in the version you use.</span>
<span class="sd"> Additionally the function supports the `pretty` option which enables</span>
<span class="sd"> pretty JSON generation.</span>
<span class="sd"> .. # noqa</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.types import *</span>
<span class="sd"> &gt;&gt;&gt; data = [(1, Row(age=2, name=&#39;Alice&#39;))]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;value&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_json(df.value).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=&#39;{&quot;age&quot;:2,&quot;name&quot;:&quot;Alice&quot;}&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; data = [(1, [Row(age=2, name=&#39;Alice&#39;), Row(age=3, name=&#39;Bob&#39;)])]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;value&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_json(df.value).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=&#39;[{&quot;age&quot;:2,&quot;name&quot;:&quot;Alice&quot;},{&quot;age&quot;:3,&quot;name&quot;:&quot;Bob&quot;}]&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; data = [(1, {&quot;name&quot;: &quot;Alice&quot;})]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;value&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_json(df.value).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=&#39;{&quot;name&quot;:&quot;Alice&quot;}&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; data = [(1, [{&quot;name&quot;: &quot;Alice&quot;}, {&quot;name&quot;: &quot;Bob&quot;}])]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;value&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_json(df.value).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=&#39;[{&quot;name&quot;:&quot;Alice&quot;},{&quot;name&quot;:&quot;Bob&quot;}]&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; data = [(1, [&quot;Alice&quot;, &quot;Bob&quot;])]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;value&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_json(df.value).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=&#39;[&quot;Alice&quot;,&quot;Bob&quot;]&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="schema_of_json"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.schema_of_json.html#pyspark.sql.functions.schema_of_json">[docs]</a><span class="k">def</span> <span class="nf">schema_of_json</span><span class="p">(</span><span class="n">json</span><span class="p">,</span> <span class="n">options</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Parses a JSON string and infers its schema in DDL format.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> json : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a JSON string or a foldable string column containing a JSON string.</span>
<span class="sd"> options : dict, optional</span>
<span class="sd"> options to control parsing. accepts the same options as the JSON datasource.</span>
<span class="sd"> See `Data Source Option &lt;https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option&gt;`_</span>
<span class="sd"> in the version you use.</span>
<span class="sd"> .. # noqa</span>
<span class="sd"> .. versionchanged:: 3.0</span>
<span class="sd"> It accepts `options` parameter to control schema inferring.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(schema_of_json(lit(&#39;{&quot;a&quot;: 0}&#39;)).alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=&#39;STRUCT&lt;`a`: BIGINT&gt;&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; schema = schema_of_json(&#39;{a: 1}&#39;, {&#39;allowUnquotedFieldNames&#39;:&#39;true&#39;})</span>
<span class="sd"> &gt;&gt;&gt; df.select(schema.alias(&quot;json&quot;)).collect()</span>
<span class="sd"> [Row(json=&#39;STRUCT&lt;`a`: BIGINT&gt;&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">json</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">json</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">json</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">json</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;schema argument should be a column or string&quot;</span><span class="p">)</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">schema_of_json</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="schema_of_csv"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.schema_of_csv.html#pyspark.sql.functions.schema_of_csv">[docs]</a><span class="k">def</span> <span class="nf">schema_of_csv</span><span class="p">(</span><span class="n">csv</span><span class="p">,</span> <span class="n">options</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Parses a CSV string and infers its schema in DDL format.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> csv : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a CSV string or a foldable string column containing a CSV string.</span>
<span class="sd"> options : dict, optional</span>
<span class="sd"> options to control parsing. accepts the same options as the CSV datasource.</span>
<span class="sd"> See `Data Source Option &lt;https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option&gt;`_</span>
<span class="sd"> in the version you use.</span>
<span class="sd"> .. # noqa</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.select(schema_of_csv(lit(&#39;1|a&#39;), {&#39;sep&#39;:&#39;|&#39;}).alias(&quot;csv&quot;)).collect()</span>
<span class="sd"> [Row(csv=&#39;STRUCT&lt;`_c0`: INT, `_c1`: STRING&gt;&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df.select(schema_of_csv(&#39;1|a&#39;, {&#39;sep&#39;:&#39;|&#39;}).alias(&quot;csv&quot;)).collect()</span>
<span class="sd"> [Row(csv=&#39;STRUCT&lt;`_c0`: INT, `_c1`: STRING&gt;&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">csv</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">csv</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">csv</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">csv</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;schema argument should be a column or string&quot;</span><span class="p">)</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">schema_of_csv</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="to_csv"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.to_csv.html#pyspark.sql.functions.to_csv">[docs]</a><span class="k">def</span> <span class="nf">to_csv</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">options</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts a column containing a :class:`StructType` into a CSV string.</span>
<span class="sd"> Throws an exception, in the case of an unsupported type.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column containing a struct.</span>
<span class="sd"> options: dict, optional</span>
<span class="sd"> options to control converting. accepts the same options as the CSV datasource.</span>
<span class="sd"> See `Data Source Option &lt;https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option&gt;`_</span>
<span class="sd"> in the version you use.</span>
<span class="sd"> .. # noqa</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; data = [(1, Row(age=2, name=&#39;Alice&#39;))]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;key&quot;, &quot;value&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(to_csv(df.value).alias(&quot;csv&quot;)).collect()</span>
<span class="sd"> [Row(csv=&#39;2,Alice&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="size"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.size.html#pyspark.sql.functions.size">[docs]</a><span class="k">def</span> <span class="nf">size</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns the length of the array or map stored in the column.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([1, 2, 3],),([1],),([],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(size(df.data)).collect()</span>
<span class="sd"> [Row(size(data)=3), Row(size(data)=1), Row(size(data)=0)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">size</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="array_min"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.array_min.html#pyspark.sql.functions.array_min">[docs]</a><span class="k">def</span> <span class="nf">array_min</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns the minimum value of the array.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_min(df.data).alias(&#39;min&#39;)).collect()</span>
<span class="sd"> [Row(min=1), Row(min=-1)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">array_min</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="array_max"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.array_max.html#pyspark.sql.functions.array_max">[docs]</a><span class="k">def</span> <span class="nf">array_max</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns the maximum value of the array.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_max(df.data).alias(&#39;max&#39;)).collect()</span>
<span class="sd"> [Row(max=3), Row(max=10)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">array_max</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="sort_array"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.sort_array.html#pyspark.sql.functions.sort_array">[docs]</a><span class="k">def</span> <span class="nf">sort_array</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">asc</span><span class="o">=</span><span class="kc">True</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: sorts the input array in ascending or descending order according</span>
<span class="sd"> to the natural ordering of the array elements. Null elements will be placed at the beginning</span>
<span class="sd"> of the returned array in ascending order or at the end of the returned array in descending</span>
<span class="sd"> order.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> asc : bool, optional</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([2, 1, None, 3],),([1],),([],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(sort_array(df.data).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=[None, 1, 2, 3]), Row(r=[1]), Row(r=[])]</span>
<span class="sd"> &gt;&gt;&gt; df.select(sort_array(df.data, asc=False).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=[3, 2, 1, None]), Row(r=[1]), Row(r=[])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">sort_array</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">asc</span><span class="p">))</span></div>
<div class="viewcode-block" id="array_sort"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.array_sort.html#pyspark.sql.functions.array_sort">[docs]</a><span class="k">def</span> <span class="nf">array_sort</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: sorts the input array in ascending order. The elements of the input array</span>
<span class="sd"> must be orderable. Null elements will be placed at the end of the returned array.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([2, 1, None, 3],),([1],),([],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_sort(df.data).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=[1, 2, 3, None]), Row(r=[1]), Row(r=[])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">array_sort</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="shuffle"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.shuffle.html#pyspark.sql.functions.shuffle">[docs]</a><span class="k">def</span> <span class="nf">shuffle</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: Generates a random permutation of the given array.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The function is non-deterministic.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([1, 20, 3, 5],), ([1, 20, None, 3],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(shuffle(df.data).alias(&#39;s&#39;)).collect() # doctest: +SKIP</span>
<span class="sd"> [Row(s=[3, 1, 5, 20]), Row(s=[20, None, 3, 1])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">shuffle</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="reverse"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.reverse.html#pyspark.sql.functions.reverse">[docs]</a><span class="k">def</span> <span class="nf">reverse</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: returns a reversed string or an array with reverse order of elements.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;Spark SQL&#39;,)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(reverse(df.data).alias(&#39;s&#39;)).collect()</span>
<span class="sd"> [Row(s=&#39;LQS krapS&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([2, 1, 3],) ,([1],) ,([],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(reverse(df.data).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=[3, 1, 2]), Row(r=[1]), Row(r=[])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">reverse</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="flatten"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.flatten.html#pyspark.sql.functions.flatten">[docs]</a><span class="k">def</span> <span class="nf">flatten</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: creates a single array from an array of arrays.</span>
<span class="sd"> If a structure of nested arrays is deeper than two levels,</span>
<span class="sd"> only one level of nesting is removed.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([([[1, 2, 3], [4, 5], [6]],), ([None, [4, 5]],)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(flatten(df.data).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=[1, 2, 3, 4, 5, 6]), Row(r=None)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">flatten</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="map_keys"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.map_keys.html#pyspark.sql.functions.map_keys">[docs]</a><span class="k">def</span> <span class="nf">map_keys</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: Returns an unordered array containing the keys of the map.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import map_keys</span>
<span class="sd"> &gt;&gt;&gt; df = spark.sql(&quot;SELECT map(1, &#39;a&#39;, 2, &#39;b&#39;) as data&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(map_keys(&quot;data&quot;).alias(&quot;keys&quot;)).show()</span>
<span class="sd"> +------+</span>
<span class="sd"> | keys|</span>
<span class="sd"> +------+</span>
<span class="sd"> |[1, 2]|</span>
<span class="sd"> +------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">map_keys</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="map_values"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.map_values.html#pyspark.sql.functions.map_values">[docs]</a><span class="k">def</span> <span class="nf">map_values</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: Returns an unordered array containing the values of the map.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import map_values</span>
<span class="sd"> &gt;&gt;&gt; df = spark.sql(&quot;SELECT map(1, &#39;a&#39;, 2, &#39;b&#39;) as data&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(map_values(&quot;data&quot;).alias(&quot;values&quot;)).show()</span>
<span class="sd"> +------+</span>
<span class="sd"> |values|</span>
<span class="sd"> +------+</span>
<span class="sd"> |[a, b]|</span>
<span class="sd"> +------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">map_values</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="map_entries"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.map_entries.html#pyspark.sql.functions.map_entries">[docs]</a><span class="k">def</span> <span class="nf">map_entries</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: Returns an unordered array of all entries in the given map.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import map_entries</span>
<span class="sd"> &gt;&gt;&gt; df = spark.sql(&quot;SELECT map(1, &#39;a&#39;, 2, &#39;b&#39;) as data&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(map_entries(&quot;data&quot;).alias(&quot;entries&quot;)).show()</span>
<span class="sd"> +----------------+</span>
<span class="sd"> | entries|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> |[{1, a}, {2, b}]|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">map_entries</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="map_from_entries"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.map_from_entries.html#pyspark.sql.functions.map_from_entries">[docs]</a><span class="k">def</span> <span class="nf">map_from_entries</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: Returns a map created from the given array of entries.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import map_from_entries</span>
<span class="sd"> &gt;&gt;&gt; df = spark.sql(&quot;SELECT array(struct(1, &#39;a&#39;), struct(2, &#39;b&#39;)) as data&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(map_from_entries(&quot;data&quot;).alias(&quot;map&quot;)).show()</span>
<span class="sd"> +----------------+</span>
<span class="sd"> | map|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> |{1 -&gt; a, 2 -&gt; b}|</span>
<span class="sd"> +----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">map_from_entries</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="array_repeat"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.array_repeat.html#pyspark.sql.functions.array_repeat">[docs]</a><span class="k">def</span> <span class="nf">array_repeat</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">count</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: creates an array containing a column repeated count times.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&#39;ab&#39;,)], [&#39;data&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(array_repeat(df.data, 3).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=[&#39;ab&#39;, &#39;ab&#39;, &#39;ab&#39;])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">array_repeat</span><span class="p">(</span>
<span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span>
<span class="n">_to_java_column</span><span class="p">(</span><span class="n">count</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">count</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> <span class="k">else</span> <span class="n">count</span>
<span class="p">))</span></div>
<div class="viewcode-block" id="arrays_zip"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.arrays_zip.html#pyspark.sql.functions.arrays_zip">[docs]</a><span class="k">def</span> <span class="nf">arrays_zip</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Collection function: Returns a merged array of structs in which the N-th struct contains all</span>
<span class="sd"> N-th values of input arrays.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> columns of arrays to be merged.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import arrays_zip</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(([1, 2, 3], [2, 3, 4]))], [&#39;vals1&#39;, &#39;vals2&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(arrays_zip(df.vals1, df.vals2).alias(&#39;zipped&#39;)).collect()</span>
<span class="sd"> [Row(zipped=[Row(vals1=1, vals2=2), Row(vals1=2, vals2=3), Row(vals1=3, vals2=4)])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">arrays_zip</span><span class="p">(</span><span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">)))</span></div>
<div class="viewcode-block" id="map_concat"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.map_concat.html#pyspark.sql.functions.map_concat">[docs]</a><span class="k">def</span> <span class="nf">map_concat</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Returns the union of all the given maps.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> column names or :class:`~pyspark.sql.Column`\\s</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import map_concat</span>
<span class="sd"> &gt;&gt;&gt; df = spark.sql(&quot;SELECT map(1, &#39;a&#39;, 2, &#39;b&#39;) as map1, map(3, &#39;c&#39;) as map2&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(map_concat(&quot;map1&quot;, &quot;map2&quot;).alias(&quot;map3&quot;)).show(truncate=False)</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> |map3 |</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> |{1 -&gt; a, 2 -&gt; b, 3 -&gt; c}|</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">set</span><span class="p">)):</span>
<span class="n">cols</span> <span class="o">=</span> <span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">map_concat</span><span class="p">(</span><span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<div class="viewcode-block" id="sequence"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.sequence.html#pyspark.sql.functions.sequence">[docs]</a><span class="k">def</span> <span class="nf">sequence</span><span class="p">(</span><span class="n">start</span><span class="p">,</span> <span class="n">stop</span><span class="p">,</span> <span class="n">step</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Generate a sequence of integers from `start` to `stop`, incrementing by `step`.</span>
<span class="sd"> If `step` is not set, it defaults to 1 if `start` is less than or equal to `stop`,</span>
<span class="sd"> and to -1 otherwise.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(-2, 2)], (&#39;C1&#39;, &#39;C2&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df1.select(sequence(&#39;C1&#39;, &#39;C2&#39;).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=[-2, -1, 0, 1, 2])]</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(4, -4, -2)], (&#39;C1&#39;, &#39;C2&#39;, &#39;C3&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df2.select(sequence(&#39;C1&#39;, &#39;C2&#39;, &#39;C3&#39;).alias(&#39;r&#39;)).collect()</span>
<span class="sd"> [Row(r=[4, 2, 0, -2, -4])]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="n">step</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">sequence</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">start</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">stop</span><span class="p">)))</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">sequence</span><span class="p">(</span>
<span class="n">_to_java_column</span><span class="p">(</span><span class="n">start</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">stop</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">step</span><span class="p">)))</span></div>
<div class="viewcode-block" id="from_csv"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.from_csv.html#pyspark.sql.functions.from_csv">[docs]</a><span class="k">def</span> <span class="nf">from_csv</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">schema</span><span class="p">,</span> <span class="n">options</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Parses a column containing a CSV string to a row with the specified schema.</span>
<span class="sd"> Returns `null` if the string cannot be parsed.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> string column in CSV format</span>
<span class="sd"> schema : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> a string with schema in DDL format to use when parsing the CSV column.</span>
<span class="sd"> options : dict, optional</span>
<span class="sd"> options to control parsing. Accepts the same options as the CSV datasource.</span>
<span class="sd"> See `Data Source Option &lt;https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option&gt;`_</span>
<span class="sd"> in the version you use.</span>
<span class="sd"> .. # noqa</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; data = [(&quot;1,2,3&quot;,)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;value&quot;,))</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_csv(df.value, &quot;a INT, b INT, c INT&quot;).alias(&quot;csv&quot;)).collect()</span>
<span class="sd"> [Row(csv=Row(a=1, b=2, c=3))]</span>
<span class="sd"> &gt;&gt;&gt; value = data[0][0]</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_csv(df.value, schema_of_csv(value)).alias(&quot;csv&quot;)).collect()</span>
<span class="sd"> [Row(csv=Row(_c0=1, _c1=2, _c2=3))]</span>
<span class="sd"> &gt;&gt;&gt; data = [(&quot; abc&quot;,)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, (&quot;value&quot;,))</span>
<span class="sd"> &gt;&gt;&gt; options = {&#39;ignoreLeadingWhiteSpace&#39;: True}</span>
<span class="sd"> &gt;&gt;&gt; df.select(from_csv(df.value, &quot;s string&quot;, options).alias(&quot;csv&quot;)).collect()</span>
<span class="sd"> [Row(csv=Row(s=&#39;abc&#39;))]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;schema argument should be a column or string&quot;</span><span class="p">)</span>
<span class="n">jc</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">from_csv</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">schema</span><span class="p">,</span> <span class="n">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="p">))</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jc</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_unresolved_named_lambda_variable</span><span class="p">(</span><span class="o">*</span><span class="n">name_parts</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Create `o.a.s.sql.expressions.UnresolvedNamedLambdaVariable`,</span>
<span class="sd"> convert it to o.a.s.sql.Column and wrap in a Python `Column`.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> name_parts : str</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">name_parts_seq</span> <span class="o">=</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">name_parts</span><span class="p">)</span>
<span class="n">expressions</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">catalyst</span><span class="o">.</span><span class="n">expressions</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span>
<span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">Column</span><span class="p">(</span>
<span class="n">expressions</span><span class="o">.</span><span class="n">UnresolvedNamedLambdaVariable</span><span class="p">(</span><span class="n">name_parts_seq</span><span class="p">)</span>
<span class="p">)</span>
<span class="p">)</span>
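<span class="c1"># A minimal usage sketch (illustrative only; assumes an active SparkContext with a</span>
<span class="c1"># running JVM gateway):</span>
<span class="c1">#</span>
<span class="c1">#     v = _unresolved_named_lambda_variable(&quot;x_0&quot;)  # Column over UnresolvedNamedLambdaVariable(Seq(&quot;x_0&quot;))</span>
<span class="c1">#     expr = (v + 1)._jc.expr()                      # JVM expression referencing the lambda variable</span>
<span class="c1">#</span>
<span class="c1"># ``_create_lambda`` below relies on this helper to mint fresh lambda arguments.</span>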
<span class="k">def</span> <span class="nf">_get_lambda_parameters</span><span class="p">(</span><span class="n">f</span><span class="p">):</span>
<span class="kn">import</span> <span class="nn">inspect</span>
<span class="n">signature</span> <span class="o">=</span> <span class="n">inspect</span><span class="o">.</span><span class="n">signature</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
<span class="n">parameters</span> <span class="o">=</span> <span class="n">signature</span><span class="o">.</span><span class="n">parameters</span><span class="o">.</span><span class="n">values</span><span class="p">()</span>
<span class="c1"># We should exclude functions that use</span>
<span class="c1"># variable args (*args) and keyword args (**kwargs),</span>
<span class="c1"># as well as keyword-only args</span>
<span class="n">supported_parameter_types</span> <span class="o">=</span> <span class="p">{</span>
<span class="n">inspect</span><span class="o">.</span><span class="n">Parameter</span><span class="o">.</span><span class="n">POSITIONAL_OR_KEYWORD</span><span class="p">,</span>
<span class="n">inspect</span><span class="o">.</span><span class="n">Parameter</span><span class="o">.</span><span class="n">POSITIONAL_ONLY</span><span class="p">,</span>
<span class="p">}</span>
<span class="c1"># Validate that</span>
<span class="c1"># function arity is between 1 and 3</span>
<span class="k">if</span> <span class="ow">not</span> <span class="p">(</span><span class="mi">1</span> <span class="o">&lt;=</span> <span class="nb">len</span><span class="p">(</span><span class="n">parameters</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="mi">3</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;f should take between 1 and 3 arguments, but provided function takes </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="nb">len</span><span class="p">(</span><span class="n">parameters</span><span class="p">)</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="c1"># and all arguments can be used as positional</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span><span class="n">p</span><span class="o">.</span><span class="n">kind</span> <span class="ow">in</span> <span class="n">supported_parameter_types</span> <span class="k">for</span> <span class="n">p</span> <span class="ow">in</span> <span class="n">parameters</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;f should use only POSITIONAL or POSITIONAL OR KEYWORD arguments&quot;</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">parameters</span>
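<span class="c1"># Illustrative behaviour of the validation above (a sketch, not executed here):</span>
<span class="c1">#</span>
<span class="c1">#     _get_lambda_parameters(lambda x, y: x)   # ok: two POSITIONAL_OR_KEYWORD parameters</span>
<span class="c1">#     _get_lambda_parameters(lambda: 1)        # ValueError: f should take between 1 and 3 arguments</span>
<span class="c1">#     _get_lambda_parameters(lambda *xs: xs)   # ValueError: only POSITIONAL or POSITIONAL OR KEYWORD arguments</span>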
<span class="k">def</span> <span class="nf">_create_lambda</span><span class="p">(</span><span class="n">f</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Create `o.a.s.sql.expressions.LambdaFunction` corresponding</span>
<span class="sd"> to transformation described by f</span>
<span class="sd"> :param f: A Python function of one of the following forms:</span>
<span class="sd"> - (Column) -&gt; Column: ...</span>
<span class="sd"> - (Column, Column) -&gt; Column: ...</span>
<span class="sd"> - (Column, Column, Column) -&gt; Column: ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">parameters</span> <span class="o">=</span> <span class="n">_get_lambda_parameters</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">expressions</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">catalyst</span><span class="o">.</span><span class="n">expressions</span>
<span class="n">argnames</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;x&quot;</span><span class="p">,</span> <span class="s2">&quot;y&quot;</span><span class="p">,</span> <span class="s2">&quot;z&quot;</span><span class="p">]</span>
<span class="n">args</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">_unresolved_named_lambda_variable</span><span class="p">(</span>
<span class="n">expressions</span><span class="o">.</span><span class="n">UnresolvedNamedLambdaVariable</span><span class="o">.</span><span class="n">freshVarName</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">argnames</span><span class="p">[:</span> <span class="nb">len</span><span class="p">(</span><span class="n">parameters</span><span class="p">)]</span>
<span class="p">]</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">f</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">result</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;f should return Column, got </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">result</span><span class="p">)))</span>
<span class="n">jexpr</span> <span class="o">=</span> <span class="n">result</span><span class="o">.</span><span class="n">_jc</span><span class="o">.</span><span class="n">expr</span><span class="p">()</span>
<span class="n">jargs</span> <span class="o">=</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="p">[</span><span class="n">arg</span><span class="o">.</span><span class="n">_jc</span><span class="o">.</span><span class="n">expr</span><span class="p">()</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span><span class="p">])</span>
<span class="k">return</span> <span class="n">expressions</span><span class="o">.</span><span class="n">LambdaFunction</span><span class="p">(</span><span class="n">jexpr</span><span class="p">,</span> <span class="n">jargs</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span>
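<span class="c1"># For example (a sketch assuming an active SparkContext), a two-argument Python</span>
<span class="c1"># lambda is turned into a JVM LambdaFunction with fresh variables derived from</span>
<span class="c1"># &quot;x&quot; and &quot;y&quot;:</span>
<span class="c1">#</span>
<span class="c1">#     jlambda = _create_lambda(lambda x, y: x + y)</span>
<span class="c1">#</span>
<span class="c1"># The result is a py4j JavaObject that can be passed to a higher-order expression</span>
<span class="c1"># constructor such as ArrayAggregate or ZipWith.</span>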
<span class="k">def</span> <span class="nf">_invoke_higher_order_function</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">funs</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Invokes the expression identified by name</span>
<span class="sd"> (relative to ``org.apache.spark.sql.catalyst.expressions``)</span>
<span class="sd"> and wraps the result with Column (first the Scala one, then the Python one).</span>
<span class="sd"> :param name: Name of the expression</span>
<span class="sd"> :param cols: a list of columns</span>
<span class="sd"> :param funs: a list of ``(*Column) -&gt; Column`` functions.</span>
<span class="sd"> :return: a Column</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">expressions</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">catalyst</span><span class="o">.</span><span class="n">expressions</span>
<span class="n">expr</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">expressions</span><span class="p">,</span> <span class="n">name</span><span class="p">)</span>
<span class="n">jcols</span> <span class="o">=</span> <span class="p">[</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)</span><span class="o">.</span><span class="n">expr</span><span class="p">()</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">cols</span><span class="p">]</span>
<span class="n">jfuns</span> <span class="o">=</span> <span class="p">[</span><span class="n">_create_lambda</span><span class="p">(</span><span class="n">f</span><span class="p">)</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">funs</span><span class="p">]</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">Column</span><span class="p">(</span><span class="n">expr</span><span class="p">(</span><span class="o">*</span><span class="n">jcols</span> <span class="o">+</span> <span class="n">jfuns</span><span class="p">)))</span>
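<span class="c1"># The public higher-order functions below are thin wrappers around this helper;</span>
<span class="c1"># for instance ``transform(col, f)`` reduces to:</span>
<span class="c1">#</span>
<span class="c1">#     _invoke_higher_order_function(&quot;ArrayTransform&quot;, [col], [f])</span>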
<div class="viewcode-block" id="transform"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.transform.html#pyspark.sql.functions.transform">[docs]</a><span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">f</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns an array of elements after applying a transformation to each element in the input array.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> f : function</span>
<span class="sd"> a function that is applied to each element of the input array.</span>
<span class="sd"> Can take one of the following forms:</span>
<span class="sd"> - Unary ``(x: Column) -&gt; Column: ...``</span>
<span class="sd"> - Binary ``(x: Column, i: Column) -&gt; Column...``, where the second argument is</span>
<span class="sd"> a 0-based index of the element.</span>
<span class="sd"> and can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, [1, 2, 3, 4])], (&quot;key&quot;, &quot;values&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(transform(&quot;values&quot;, lambda x: x * 2).alias(&quot;doubled&quot;)).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> | doubled|</span>
<span class="sd"> +------------+</span>
<span class="sd"> |[2, 4, 6, 8]|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &gt;&gt;&gt; def alternate(x, i):</span>
<span class="sd"> ... return when(i % 2 == 0, x).otherwise(-x)</span>
<span class="sd"> &gt;&gt;&gt; df.select(transform(&quot;values&quot;, alternate).alias(&quot;alternated&quot;)).show()</span>
<span class="sd"> +--------------+</span>
<span class="sd"> | alternated|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> |[1, -2, 3, -4]|</span>
<span class="sd"> +--------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;ArrayTransform&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div>
<div class="viewcode-block" id="exists"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.exists.html#pyspark.sql.functions.exists">[docs]</a><span class="k">def</span> <span class="nf">exists</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">f</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns whether a predicate holds for one or more elements in the array.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> f : function</span>
<span class="sd"> ``(x: Column) -&gt; Column: ...`` returning the Boolean expression.</span>
<span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, [1, 2, 3, 4]), (2, [3, -1, 0])],(&quot;key&quot;, &quot;values&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(exists(&quot;values&quot;, lambda x: x &lt; 0).alias(&quot;any_negative&quot;)).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |any_negative|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | false|</span>
<span class="sd"> | true|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;ArrayExists&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div>
<div class="viewcode-block" id="forall"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.forall.html#pyspark.sql.functions.forall">[docs]</a><span class="k">def</span> <span class="nf">forall</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">f</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns whether a predicate holds for every element in the array.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> f : function</span>
<span class="sd"> ``(x: Column) -&gt; Column: ...`` returning the Boolean expression.</span>
<span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(1, [&quot;bar&quot;]), (2, [&quot;foo&quot;, &quot;bar&quot;]), (3, [&quot;foobar&quot;, &quot;foo&quot;])],</span>
<span class="sd"> ... (&quot;key&quot;, &quot;values&quot;)</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(forall(&quot;values&quot;, lambda x: x.rlike(&quot;foo&quot;)).alias(&quot;all_foo&quot;)).show()</span>
<span class="sd"> +-------+</span>
<span class="sd"> |all_foo|</span>
<span class="sd"> +-------+</span>
<span class="sd"> | false|</span>
<span class="sd"> | false|</span>
<span class="sd"> | true|</span>
<span class="sd"> +-------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;ArrayForAll&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div>
<div class="viewcode-block" id="filter"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.filter.html#pyspark.sql.functions.filter">[docs]</a><span class="k">def</span> <span class="nf">filter</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">f</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns an array of elements for which a predicate holds in a given array.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> f : function</span>
<span class="sd"> A function that returns the Boolean expression.</span>
<span class="sd"> Can take one of the following forms:</span>
<span class="sd"> - Unary ``(x: Column) -&gt; Column: ...``</span>
<span class="sd"> - Binary ``(x: Column, i: Column) -&gt; Column...``, where the second argument is</span>
<span class="sd"> a 0-based index of the element.</span>
<span class="sd"> and can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(1, [&quot;2018-09-20&quot;, &quot;2019-02-03&quot;, &quot;2019-07-01&quot;, &quot;2020-06-01&quot;])],</span>
<span class="sd"> ... (&quot;key&quot;, &quot;values&quot;)</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; def after_second_quarter(x):</span>
<span class="sd"> ... return month(to_date(x)) &gt; 6</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... filter(&quot;values&quot;, after_second_quarter).alias(&quot;after_second_quarter&quot;)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> |after_second_quarter |</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> |[2018-09-20, 2019-07-01]|</span>
<span class="sd"> +------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;ArrayFilter&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div>
<div class="viewcode-block" id="aggregate"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.aggregate.html#pyspark.sql.functions.aggregate">[docs]</a><span class="k">def</span> <span class="nf">aggregate</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">initialValue</span><span class="p">,</span> <span class="n">merge</span><span class="p">,</span> <span class="n">finish</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Applies a binary operator to an initial state and all elements in the array,</span>
<span class="sd"> and reduces this to a single state. The final state is converted into the final result</span>
<span class="sd"> by applying a finish function.</span>
<span class="sd"> Both functions can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> initialValue : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> initial value. Name of column or expression</span>
<span class="sd"> merge : function</span>
<span class="sd"> a binary function ``(acc: Column, x: Column) -&gt; Column...`` returning an expression</span>
<span class="sd"> of the same type as ``initialValue``</span>
<span class="sd"> finish : function</span>
<span class="sd"> an optional unary function ``(x: Column) -&gt; Column: ...``</span>
<span class="sd"> used to convert the accumulated value.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, [20.0, 4.0, 2.0, 6.0, 10.0])], (&quot;id&quot;, &quot;values&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(aggregate(&quot;values&quot;, lit(0.0), lambda acc, x: acc + x).alias(&quot;sum&quot;)).show()</span>
<span class="sd"> +----+</span>
<span class="sd"> | sum|</span>
<span class="sd"> +----+</span>
<span class="sd"> |42.0|</span>
<span class="sd"> +----+</span>
<span class="sd"> &gt;&gt;&gt; def merge(acc, x):</span>
<span class="sd"> ... count = acc.count + 1</span>
<span class="sd"> ... sum = acc.sum + x</span>
<span class="sd"> ... return struct(count.alias(&quot;count&quot;), sum.alias(&quot;sum&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... aggregate(</span>
<span class="sd"> ... &quot;values&quot;,</span>
<span class="sd"> ... struct(lit(0).alias(&quot;count&quot;), lit(0.0).alias(&quot;sum&quot;)),</span>
<span class="sd"> ... merge,</span>
<span class="sd"> ... lambda acc: acc.sum / acc.count,</span>
<span class="sd"> ... ).alias(&quot;mean&quot;)</span>
<span class="sd"> ... ).show()</span>
<span class="sd"> +----+</span>
<span class="sd"> |mean|</span>
<span class="sd"> +----+</span>
<span class="sd"> | 8.4|</span>
<span class="sd"> +----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">finish</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span>
<span class="s2">&quot;ArrayAggregate&quot;</span><span class="p">,</span>
<span class="p">[</span><span class="n">col</span><span class="p">,</span> <span class="n">initialValue</span><span class="p">],</span>
<span class="p">[</span><span class="n">merge</span><span class="p">,</span> <span class="n">finish</span><span class="p">]</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span>
<span class="s2">&quot;ArrayAggregate&quot;</span><span class="p">,</span>
<span class="p">[</span><span class="n">col</span><span class="p">,</span> <span class="n">initialValue</span><span class="p">],</span>
<span class="p">[</span><span class="n">merge</span><span class="p">]</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="zip_with"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.zip_with.html#pyspark.sql.functions.zip_with">[docs]</a><span class="k">def</span> <span class="nf">zip_with</span><span class="p">(</span><span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">,</span> <span class="n">f</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Merge two given arrays, element-wise, into a single array using a function.</span>
<span class="sd"> If one array is shorter, nulls are appended at the end to match the length of the longer</span>
<span class="sd"> array, before applying the function.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> left : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of the first column or expression</span>
<span class="sd"> right : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of the second column or expression</span>
<span class="sd"> f : function</span>
<span class="sd"> a binary function ``(x1: Column, x2: Column) -&gt; Column...``</span>
<span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, [1, 3, 5, 8], [0, 2, 4, 6])], (&quot;id&quot;, &quot;xs&quot;, &quot;ys&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(zip_with(&quot;xs&quot;, &quot;ys&quot;, lambda x, y: x ** y).alias(&quot;powers&quot;)).show(truncate=False)</span>
<span class="sd"> +---------------------------+</span>
<span class="sd"> |powers |</span>
<span class="sd"> +---------------------------+</span>
<span class="sd"> |[1.0, 9.0, 625.0, 262144.0]|</span>
<span class="sd"> +---------------------------+</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, [&quot;foo&quot;, &quot;bar&quot;], [1, 2, 3])], (&quot;id&quot;, &quot;xs&quot;, &quot;ys&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(zip_with(&quot;xs&quot;, &quot;ys&quot;, lambda x, y: concat_ws(&quot;_&quot;, x, y)).alias(&quot;xs_ys&quot;)).show()</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> | xs_ys|</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> |[foo_1, bar_2, 3]|</span>
<span class="sd"> +-----------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;ZipWith&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div>
<div class="viewcode-block" id="transform_keys"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.transform_keys.html#pyspark.sql.functions.transform_keys">[docs]</a><span class="k">def</span> <span class="nf">transform_keys</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">f</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Applies a function to every key-value pair in a map and returns</span>
<span class="sd"> a map with the results of those applications as the new keys for the pairs.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> f : function</span>
<span class="sd"> a binary function ``(k: Column, v: Column) -&gt; Column...``</span>
<span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, {&quot;foo&quot;: -2.0, &quot;bar&quot;: 2.0})], (&quot;id&quot;, &quot;data&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(transform_keys(</span>
<span class="sd"> ... &quot;data&quot;, lambda k, _: upper(k)).alias(&quot;data_upper&quot;)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +-------------------------+</span>
<span class="sd"> |data_upper |</span>
<span class="sd"> +-------------------------+</span>
<span class="sd"> |{BAR -&gt; 2.0, FOO -&gt; -2.0}|</span>
<span class="sd"> +-------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;TransformKeys&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div>
<div class="viewcode-block" id="transform_values"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.transform_values.html#pyspark.sql.functions.transform_values">[docs]</a><span class="k">def</span> <span class="nf">transform_values</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">f</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Applies a function to every key-value pair in a map and returns</span>
<span class="sd"> a map with the results of those applications as the new values for the pairs.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> f : function</span>
<span class="sd"> a binary function ``(k: Column, v: Column) -&gt; Column...``</span>
<span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, {&quot;IT&quot;: 10.0, &quot;SALES&quot;: 2.0, &quot;OPS&quot;: 24.0})], (&quot;id&quot;, &quot;data&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(transform_values(</span>
<span class="sd"> ... &quot;data&quot;, lambda k, v: when(k.isin(&quot;IT&quot;, &quot;OPS&quot;), v + 10.0).otherwise(v)</span>
<span class="sd"> ... ).alias(&quot;new_data&quot;)).show(truncate=False)</span>
<span class="sd"> +---------------------------------------+</span>
<span class="sd"> |new_data |</span>
<span class="sd"> +---------------------------------------+</span>
<span class="sd"> |{OPS -&gt; 34.0, IT -&gt; 20.0, SALES -&gt; 2.0}|</span>
<span class="sd"> +---------------------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;TransformValues&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div>
<div class="viewcode-block" id="map_filter"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.map_filter.html#pyspark.sql.functions.map_filter">[docs]</a><span class="k">def</span> <span class="nf">map_filter</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">f</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a map whose key-value pairs satisfy a predicate.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of column or expression</span>
<span class="sd"> f : function</span>
<span class="sd"> a binary function ``(k: Column, v: Column) -&gt; Column...``</span>
<span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, {&quot;foo&quot;: 42.0, &quot;bar&quot;: 1.0, &quot;baz&quot;: 32.0})], (&quot;id&quot;, &quot;data&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(map_filter(</span>
<span class="sd"> ... &quot;data&quot;, lambda _, v: v &gt; 30.0).alias(&quot;data_filtered&quot;)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +--------------------------+</span>
<span class="sd"> |data_filtered |</span>
<span class="sd"> +--------------------------+</span>
<span class="sd"> |{baz -&gt; 32.0, foo -&gt; 42.0}|</span>
<span class="sd"> +--------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;MapFilter&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div>
<div class="viewcode-block" id="map_zip_with"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.map_zip_with.html#pyspark.sql.functions.map_zip_with">[docs]</a><span class="k">def</span> <span class="nf">map_zip_with</span><span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">,</span> <span class="n">f</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Merge two given maps, key-wise, into a single map using a function.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of the first column or expression</span>
<span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> name of the second column or expression</span>
<span class="sd"> f : function</span>
<span class="sd"> a ternary function ``(k: Column, v1: Column, v2: Column) -&gt; Column...``</span>
<span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span>
<span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span>
<span class="sd"> Python ``UserDefinedFunctions`` are not supported</span>
<span class="sd"> (`SPARK-27052 &lt;https://issues.apache.org/jira/browse/SPARK-27052&gt;`__).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`~pyspark.sql.Column`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (1, {&quot;IT&quot;: 24.0, &quot;SALES&quot;: 12.00}, {&quot;IT&quot;: 2.0, &quot;SALES&quot;: 1.4})],</span>
<span class="sd"> ... (&quot;id&quot;, &quot;base&quot;, &quot;ratio&quot;)</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(map_zip_with(</span>
<span class="sd"> ... &quot;base&quot;, &quot;ratio&quot;, lambda k, v1, v2: round(v1 * v2, 2)).alias(&quot;updated_data&quot;)</span>
<span class="sd"> ... ).show(truncate=False)</span>
<span class="sd"> +---------------------------+</span>
<span class="sd"> |updated_data |</span>
<span class="sd"> +---------------------------+</span>
<span class="sd"> |{SALES -&gt; 16.8, IT -&gt; 48.0}|</span>
<span class="sd"> +---------------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">&quot;MapZipWith&quot;</span><span class="p">,</span> <span class="p">[</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div>
<span class="c1"># ---------------------- Partition transform functions --------------------------------</span>
<div class="viewcode-block" id="years"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.years.html#pyspark.sql.functions.years">[docs]</a><span class="k">def</span> <span class="nf">years</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Partition transform function: A transform for timestamps and dates</span>
<span class="sd"> to partition data into years.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.writeTo(&quot;catalog.db.table&quot;).partitionedBy( # doctest: +SKIP</span>
<span class="sd"> ... years(&quot;ts&quot;)</span>
<span class="sd"> ... ).createOrReplace()</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This function can be used only in combination with</span>
<span class="sd"> :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`</span>
<span class="sd"> method of the `DataFrameWriterV2`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">years</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="months"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.months.html#pyspark.sql.functions.months">[docs]</a><span class="k">def</span> <span class="nf">months</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Partition transform function: A transform for timestamps and dates</span>
<span class="sd"> to partition data into months.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.writeTo(&quot;catalog.db.table&quot;).partitionedBy(</span>
<span class="sd"> ... months(&quot;ts&quot;)</span>
<span class="sd"> ... ).createOrReplace() # doctest: +SKIP</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This function can be used only in combination with</span>
<span class="sd"> :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`</span>
<span class="sd"> method of the `DataFrameWriterV2`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">months</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="days"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.days.html#pyspark.sql.functions.days">[docs]</a><span class="k">def</span> <span class="nf">days</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Partition transform function: A transform for timestamps and dates</span>
<span class="sd"> to partition data into days.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.writeTo(&quot;catalog.db.table&quot;).partitionedBy( # doctest: +SKIP</span>
<span class="sd"> ... days(&quot;ts&quot;)</span>
<span class="sd"> ... ).createOrReplace()</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This function can be used only in combination with</span>
<span class="sd"> :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`</span>
<span class="sd"> method of the `DataFrameWriterV2`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">days</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="hours"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.hours.html#pyspark.sql.functions.hours">[docs]</a><span class="k">def</span> <span class="nf">hours</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Partition transform function: A transform for timestamps</span>
<span class="sd"> to partition data into hours.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.writeTo(&quot;catalog.db.table&quot;).partitionedBy( # doctest: +SKIP</span>
<span class="sd"> ... hours(&quot;ts&quot;)</span>
<span class="sd"> ... ).createOrReplace()</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This function can be used only in combination with</span>
<span class="sd"> :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`</span>
<span class="sd"> method of the `DataFrameWriterV2`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">hours</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<div class="viewcode-block" id="bucket"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.bucket.html#pyspark.sql.functions.bucket">[docs]</a><span class="k">def</span> <span class="nf">bucket</span><span class="p">(</span><span class="n">numBuckets</span><span class="p">,</span> <span class="n">col</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Partition transform function: A transform for any type that partitions</span>
<span class="sd"> by a hash of the input column.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.writeTo(&quot;catalog.db.table&quot;).partitionedBy( # doctest: +SKIP</span>
<span class="sd"> ... bucket(42, &quot;ts&quot;)</span>
<span class="sd"> ... ).createOrReplace()</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This function can be used only in combination with</span>
<span class="sd"> :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`</span>
<span class="sd"> method of the `DataFrameWriterV2`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">numBuckets</span><span class="p">,</span> <span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="n">Column</span><span class="p">)):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;numBuckets should be a Column or an int, got </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">numBuckets</span><span class="p">))</span>
<span class="p">)</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="n">numBuckets</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">numBuckets</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">numBuckets</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span>
<span class="k">else</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">numBuckets</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">bucket</span><span class="p">(</span><span class="n">numBuckets</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)))</span></div>
<span class="c1"># ---------------------------- User Defined Function ----------------------------------</span>
<div class="viewcode-block" id="udf"><a class="viewcode-back" href="../../../reference/api/pyspark.sql.functions.udf.html#pyspark.sql.functions.udf">[docs]</a><span class="k">def</span> <span class="nf">udf</span><span class="p">(</span><span class="n">f</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">returnType</span><span class="o">=</span><span class="n">StringType</span><span class="p">()):</span>
<span class="sd">&quot;&quot;&quot;Creates a user defined function (UDF).</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> f : function</span>
<span class="sd"> python function if used as a standalone function</span>
<span class="sd"> returnType : :class:`pyspark.sql.types.DataType` or str</span>
<span class="sd"> the return type of the user-defined function. The value can be either a</span>
<span class="sd"> :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.types import IntegerType</span>
<span class="sd"> &gt;&gt;&gt; slen = udf(lambda s: len(s), IntegerType())</span>
<span class="sd"> &gt;&gt;&gt; @udf</span>
<span class="sd"> ... def to_upper(s):</span>
<span class="sd"> ... if s is not None:</span>
<span class="sd"> ... return s.upper()</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; @udf(returnType=IntegerType())</span>
<span class="sd"> ... def add_one(x):</span>
<span class="sd"> ... if x is not None:</span>
<span class="sd"> ... return x + 1</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, &quot;John Doe&quot;, 21)], (&quot;id&quot;, &quot;name&quot;, &quot;age&quot;))</span>
<span class="sd"> &gt;&gt;&gt; df.select(slen(&quot;name&quot;).alias(&quot;slen(name)&quot;), to_upper(&quot;name&quot;), add_one(&quot;age&quot;)).show()</span>
<span class="sd"> +----------+--------------+------------+</span>
<span class="sd"> |slen(name)|to_upper(name)|add_one(age)|</span>
<span class="sd"> +----------+--------------+------------+</span>
<span class="sd"> | 8| JOHN DOE| 22|</span>
<span class="sd"> +----------+--------------+------------+</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The user-defined functions are considered deterministic by default. Due to</span>
<span class="sd"> optimization, duplicate invocations may be eliminated or the function may even be invoked</span>
<span class="sd"> more times than it is present in the query. If your function is not deterministic, call</span>
<span class="sd"> `asNondeterministic` on the user defined function. E.g.:</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.types import IntegerType</span>
<span class="sd"> &gt;&gt;&gt; import random</span>
<span class="sd"> &gt;&gt;&gt; random_udf = udf(lambda: int(random.random() * 100), IntegerType()).asNondeterministic()</span>
<span class="sd"> The user-defined functions do not support conditional expressions or short circuiting</span>
<span class="sd"> in boolean expressions and it ends up with being executed all internally. If the functions</span>
<span class="sd"> can fail on special rows, the workaround is to incorporate the condition into the functions.</span>
<span class="sd"> The user-defined functions do not take keyword arguments on the calling side.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># The following table shows most of Python data and SQL type conversions in normal UDFs that</span>
<span class="c1"># are not yet visible to the user. Some of behaviors are buggy and might be changed in the near</span>
<span class="c1"># future. The table might have to be eventually documented externally.</span>
<span class="c1"># Please see SPARK-28131&#39;s PR to see the codes in order to generate the table below.</span>
<span class="c1">#</span>
<span class="c1"># +-----------------------------+--------------+----------+------+---------------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+----------------------------+------------+--------------+------------------+----------------------+ # noqa</span>
<span class="c1"># |SQL Type \ Python Value(Type)|None(NoneType)|True(bool)|1(int)| a(str)| 1970-01-01(date)|1970-01-01 00:00:00(datetime)|1.0(float)|array(&#39;i&#39;, [1])(array)|[1](list)| (1,)(tuple)|bytearray(b&#39;ABC&#39;)(bytearray)| 1(Decimal)|{&#39;a&#39;: 1}(dict)|Row(kwargs=1)(Row)|Row(namedtuple=1)(Row)| # noqa</span>
<span class="c1"># +-----------------------------+--------------+----------+------+---------------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+----------------------------+------------+--------------+------------------+----------------------+ # noqa</span>
<span class="c1"># | boolean| None| True| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa</span>
<span class="c1"># | tinyint| None| None| 1| None| None| None| None| None| None| None| None| None| None| X| X| # noqa</span>
<span class="c1"># | smallint| None| None| 1| None| None| None| None| None| None| None| None| None| None| X| X| # noqa</span>
<span class="c1"># | int| None| None| 1| None| None| None| None| None| None| None| None| None| None| X| X| # noqa</span>
<span class="c1"># | bigint| None| None| 1| None| None| None| None| None| None| None| None| None| None| X| X| # noqa</span>
<span class="c1"># | string| None| &#39;true&#39;| &#39;1&#39;| &#39;a&#39;|&#39;java.util.Gregor...| &#39;java.util.Gregor...| &#39;1.0&#39;| &#39;[I@66cbb73a&#39;| &#39;[1]&#39;|&#39;[Ljava.lang.Obje...| &#39;[B@5a51eb1a&#39;| &#39;1&#39;| &#39;{a=1}&#39;| X| X| # noqa</span>
<span class="c1"># | date| None| X| X| X|datetime.date(197...| datetime.date(197...| X| X| X| X| X| X| X| X| X| # noqa</span>
<span class="c1"># | timestamp| None| X| X| X| X| datetime.datetime...| X| X| X| X| X| X| X| X| X| # noqa</span>
<span class="c1"># | float| None| None| None| None| None| None| 1.0| None| None| None| None| None| None| X| X| # noqa</span>
<span class="c1"># | double| None| None| None| None| None| None| 1.0| None| None| None| None| None| None| X| X| # noqa</span>
<span class="c1"># | array&lt;int&gt;| None| None| None| None| None| None| None| [1]| [1]| [1]| [65, 66, 67]| None| None| X| X| # noqa</span>
<span class="c1"># | binary| None| None| None|bytearray(b&#39;a&#39;)| None| None| None| None| None| None| bytearray(b&#39;ABC&#39;)| None| None| X| X| # noqa</span>
<span class="c1"># | decimal(10,0)| None| None| None| None| None| None| None| None| None| None| None|Decimal(&#39;1&#39;)| None| X| X| # noqa</span>
<span class="c1"># | map&lt;string,int&gt;| None| None| None| None| None| None| None| None| None| None| None| None| {&#39;a&#39;: 1}| X| X| # noqa</span>
<span class="c1"># | struct&lt;_1:int&gt;| None| X| X| X| X| X| X| X|Row(_1=1)| Row(_1=1)| X| X| Row(_1=None)| Row(_1=1)| Row(_1=1)| # noqa</span>
<span class="c1"># +-----------------------------+--------------+----------+------+---------------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+----------------------------+------------+--------------+------------------+----------------------+ # noqa</span>
<span class="c1">#</span>
<span class="c1"># Note: DDL formatted string is used for &#39;SQL Type&#39; for simplicity. This string can be</span>
<span class="c1"># used in `returnType`.</span>
<span class="c1"># Note: The values inside of the table are generated by `repr`.</span>
<span class="c1"># Note: &#39;X&#39; means it throws an exception during the conversion.</span>
<span class="c1"># Note: Python 3.7.3 is used.</span>
<span class="c1"># decorator @udf, @udf(), @udf(dataType())</span>
<span class="k">if</span> <span class="n">f</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">DataType</span><span class="p">)):</span>
<span class="c1"># If DataType has been passed as a positional argument</span>
<span class="c1"># for decorator use it as a returnType</span>
<span class="n">return_type</span> <span class="o">=</span> <span class="n">f</span> <span class="ow">or</span> <span class="n">returnType</span>
<span class="k">return</span> <span class="n">functools</span><span class="o">.</span><span class="n">partial</span><span class="p">(</span><span class="n">_create_udf</span><span class="p">,</span> <span class="n">returnType</span><span class="o">=</span><span class="n">return_type</span><span class="p">,</span>
<span class="n">evalType</span><span class="o">=</span><span class="n">PythonEvalType</span><span class="o">.</span><span class="n">SQL_BATCHED_UDF</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">_create_udf</span><span class="p">(</span><span class="n">f</span><span class="o">=</span><span class="n">f</span><span class="p">,</span> <span class="n">returnType</span><span class="o">=</span><span class="n">returnType</span><span class="p">,</span>
<span class="n">evalType</span><span class="o">=</span><span class="n">PythonEvalType</span><span class="o">.</span><span class="n">SQL_BATCHED_UDF</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_test</span><span class="p">():</span>
<span class="kn">import</span> <span class="nn">doctest</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Row</span><span class="p">,</span> <span class="n">SparkSession</span>
<span class="kn">import</span> <span class="nn">pyspark.sql.functions</span>
<span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span>\
<span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">&quot;local[4]&quot;</span><span class="p">)</span>\
<span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">&quot;sql.functions tests&quot;</span><span class="p">)</span>\
<span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">sparkContext</span>
<span class="n">globs</span><span class="p">[</span><span class="s1">&#39;sc&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">sc</span>
<span class="n">globs</span><span class="p">[</span><span class="s1">&#39;spark&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">spark</span>
<span class="n">globs</span><span class="p">[</span><span class="s1">&#39;df&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span><span class="n">Row</span><span class="p">(</span><span class="n">age</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s1">&#39;Alice&#39;</span><span class="p">),</span> <span class="n">Row</span><span class="p">(</span><span class="n">age</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s1">&#39;Bob&#39;</span><span class="p">)])</span>
<span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span>
<span class="n">pyspark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">functions</span><span class="p">,</span> <span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span>
<span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span><span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span>
<span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span>
<span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
<span class="n">_test</span><span class="p">()</span>
</pre></div>
</div>
</main>
</div>
</div>
<script src="../../../_static/js/index.3da636dd464baa7582d2.js"></script>
<footer class="footer mt-5 mt-md-0">
<div class="container">
<p>
&copy; Copyright .<br/>
Created using <a href="http://sphinx-doc.org/">Sphinx</a> 3.0.4.<br/>
</p>
</div>
</footer>
</body>
</html>